diff options
Diffstat (limited to 'fs')
529 files changed, 20722 insertions, 15551 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3e68521f4e2f..399d455d50d6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir, * */ -static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int err; u32 perm; @@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (fid) p9_fid_put(fid); - - return err; + return ERR_PTR(err); } /** diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 143ac03b7425..cc2007be2173 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -350,9 +350,9 @@ out: * */ -static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t omode) +static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -417,7 +417,7 @@ error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); - return err; + return ERR_PTR(err); } static int diff --git a/fs/Kconfig b/fs/Kconfig index 64d420e3c475..afe21866d6b4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -336,7 +336,6 @@ source "fs/qnx4/Kconfig" source "fs/qnx6/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" -source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/erofs/Kconfig" source "fs/vboxsf/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 15df0a923d3a..77fd7f7b5d02 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -87,7 +87,6 @@ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-y += unicode/ -obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS) += smb/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS3_FS) += ntfs3/ diff --git a/fs/affs/affs.h b/fs/affs/affs.h 
index e8c2c4535cb3..ac4e9a02910b 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, +extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 8c154490a2d6..f883be50db12 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir, return 0; } -int +struct dentry * affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inode = affs_new_inode(dir); if (!inode) - return -ENOSPC; + return ERR_PTR(-ENOSPC); inode->i_mode = S_IFDIR | mode; affs_mode_to_prot(inode); @@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, clear_nlink(inode); mark_inode_dirty(inode); iput(inode); - return error; + return ERR_PTR(error); } - return 0; + return NULL; } int diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 6d42f85c6be5..e941da5b6dd9 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -362,3 +362,61 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, alist->nr_addrs++; return 0; } + +/* + * Set the app data on the rxrpc peers an address list points to. Peers only + * on @new_alist get @server, peers only on @old_alist get cleared and peers on + * both are left alone. At most one of the lists may be NULL. 
+ */ +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist) +{ + unsigned long data = (unsigned long)server; + int n = 0, o = 0; + + if (!old_alist) { + 
/* New server. Just set all. */ + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + return; + } + if (!new_alist) { + /* Dead server. Just remove all. */ + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); + return; + } + + /* Walk through the two lists simultaneously, setting new peers and + * clearing old ones. The two lists are ordered by pointer to peer + * record. + */ + while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) { + struct rxrpc_peer *pn = new_alist->addrs[n].peer; + struct rxrpc_peer *po = old_alist->addrs[o].peer; + + if (pn == po) { + /* Peer is on both lists: leave its app data alone and + * step past it in both lists so that the walk advances. + */ + n++; + o++; + continue; + } + if (pn < po) { + rxrpc_kernel_set_peer_data(pn, data); + n++; + } else { + rxrpc_kernel_set_peer_data(po, 0); + o++; + } + } + + if (n < new_alist->nr_addrs) + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + if (o < old_alist->nr_addrs) + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); +} diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 96a6781f3653..0168bbf53fe0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static atomic_t cell_debug_id; -static void afs_queue_cell_manager(struct afs_net *); -static void afs_manage_cell_work(struct work_struct *); +static void afs_cell_timer(struct timer_list *timer); +static void afs_destroy_cell_work(struct work_struct *work); +static void afs_manage_cell_work(struct work_struct *work); static void afs_dec_cells_outstanding(struct afs_net *net) { @@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net) wake_up_var(&net->cells_outstanding); } -/* - * Set the cell timer to fire after 
a given delay, assuming it's not already - * set for an earlier time. 
- */ -static void afs_set_cell_timer(struct afs_net *net, time64_t delay) +static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state) { - if (net->live) { - atomic_inc(&net->cells_outstanding); - if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) - afs_dec_cells_outstanding(net); - } else { - afs_queue_cell_manager(net); - } + smp_store_release(&cell->state, state); /* Commit cell changes before state */ + smp_wmb(); /* Set cell state before task state */ + wake_up_var(&cell->state); } /* @@ -116,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, const char *addresses) { - struct afs_vlserver_list *vllist; + struct afs_vlserver_list *vllist = NULL; struct afs_cell *cell; int i, ret; @@ -163,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); + INIT_WORK(&cell->destroyer, afs_destroy_cell_work); INIT_WORK(&cell->manager, afs_manage_cell_work); + timer_setup(&cell->management_timer, afs_cell_timer, 0); init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; - seqlock_init(&cell->fs_lock); + init_rwsem(&cell->fs_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); @@ -204,7 +199,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ atomic_inc(&net->cells_outstanding); + ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, + 2, INT_MAX / 2, GFP_KERNEL); + if (ret < 0) + goto error; + cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); @@ -214,6 +215,7 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP 
address\n"); error: + afs_put_vlserverlist(cell->net, vllist); kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); @@ -227,6 +229,7 @@ error: * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. * @excl: T if an error should be given if the cell name already exists. + * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if * needed. Note that that actual DNS query is punted off to the manager thread @@ -235,7 +238,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl) + const char *vllist, bool excl, + enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; struct rb_node *parent, **pp; @@ -245,7 +249,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, _enter("%s,%s", name, vllist); if (!excl) { - cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup); + cell = afs_find_cell(net, name, namesz, trace); if (!IS_ERR(cell)) goto wait_for_cell; } @@ -288,26 +292,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; - atomic_set(&cell->active, 2); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert); + afs_use_cell(cell, trace); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_get_queue_new); + afs_queue_cell(cell, afs_cell_trace_queue_new); wait_for_cell: - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active), - afs_cell_trace_wait); _debug("wait_for_cell"); - wait_var_event(&cell->state, - ({ - state = smp_load_acquire(&cell->state); /* vs error */ - state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED; - })); + state = smp_load_acquire(&cell->state); /* vs error */ + if (state != 
AFS_CELL_ACTIVE && + state != AFS_CELL_DEAD) { + afs_see_cell(cell, afs_cell_trace_wait); + wait_var_event(&cell->state, + ({ + state = smp_load_acquire(&cell->state); /* vs error */ + state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; + })); + } /* Check the state obtained from the wait check. */ - if (state == AFS_CELL_REMOVED) { + if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } @@ -321,7 +327,7 @@ cell_already_exists: if (excl) { ret = -EEXIST; } else { - afs_use_cell(cursor, afs_cell_trace_use_lookup); + afs_use_cell(cursor, trace); ret = 0; } up_write(&net->cells_lock); @@ -331,7 +337,7 @@ cell_already_exists: goto wait_for_cell; goto error_noput; error: - afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -376,8 +382,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) if (cp && cp < rootcell + len) return -EINVAL; - /* allocate a cell record for the root cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false); + /* allocate a cell record for the root/workstation cell */ + new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); @@ -388,12 +395,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) /* install the new cell */ down_write(&net->cells_lock); - afs_see_cell(new_root, afs_cell_trace_see_ws); old_root = rcu_replace_pointer(net->ws_cell, new_root, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); - afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws); + afs_unuse_cell(old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } @@ -511,8 +517,9 @@ static void afs_cell_destroy(struct rcu_head *rcu) trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free); afs_put_vlserverlist(net, 
rcu_access_pointer(cell->vl_servers)); - afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); + afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); + idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); kfree(cell->name - 1); kfree(cell); @@ -520,30 +527,14 @@ static void afs_cell_destroy(struct rcu_head *rcu) _leave(" [destroyed]"); } -/* - * Queue the cell manager. - */ -static void afs_queue_cell_manager(struct afs_net *net) -{ - int outstanding = atomic_inc_return(&net->cells_outstanding); - - _enter("%d", outstanding); - - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); -} - -/* - * Cell management timer. We have an increment on cells_outstanding that we - * need to pass along to the work item. - */ -void afs_cells_timer(struct timer_list *timer) +static void afs_destroy_cell_work(struct work_struct *work) { - struct afs_net *net = container_of(timer, struct afs_net, cells_timer); + struct afs_cell *cell = container_of(work, struct afs_cell, destroyer); - _enter(""); - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); + afs_see_cell(cell, afs_cell_trace_destroy); + timer_delete_sync(&cell->management_timer); + cancel_work_sync(&cell->manager); + call_rcu(&cell->rcu, afs_cell_destroy); } /* @@ -575,7 +566,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) if (zero) { a = atomic_read(&cell->active); WARN(a != 0, "Cell active count %u > 0\n", a); - call_rcu(&cell->rcu, afs_cell_destroy); + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } } } @@ -587,10 +578,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r, a; - r = refcount_read(&cell->ref); - WARN_ON(r == 0); + __refcount_inc(&cell->ref, &r); a = atomic_inc_return(&cell->active); - trace_afs_cell(cell->debug_id, r, a, reason); + trace_afs_cell(cell->debug_id, r + 1, a, reason); return cell; } @@ -598,10 +588,11 @@ struct afs_cell 
*afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) * Record a cell becoming less active. When the active counter reaches 1, it * is scheduled for destruction, but may get reactivated. */ -void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason) { unsigned int debug_id; time64_t now, expire_delay; + bool zero; int r, a; if (!cell) @@ -616,13 +607,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr expire_delay = afs_cell_gc_delay; debug_id = cell->debug_id; - r = refcount_read(&cell->ref); a = atomic_dec_return(&cell->active); - trace_afs_cell(debug_id, r, a, reason); - WARN_ON(a == 0); - if (a == 1) + if (!a) /* 'cell' may now be garbage collected. */ - afs_set_cell_timer(net, expire_delay); + afs_set_cell_timer(cell, expire_delay); + + zero = __refcount_dec_and_test(&cell->ref, &r); + trace_afs_cell(debug_id, r - 1, a, reason); + if (zero) + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } /* @@ -642,9 +635,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - afs_get_cell(cell, reason); - if (!queue_work(afs_wq, &cell->manager)) - afs_put_cell(cell, afs_cell_trace_put_queue_fail); + queue_work(afs_wq, &cell->manager); +} + +/* + * Cell-specific management timer. + */ +static void afs_cell_timer(struct timer_list *timer) +{ + struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer); + + afs_see_cell(cell, afs_cell_trace_see_mgmt_timer); + if (refcount_read(&cell->ref) > 0 && cell->net->live) + queue_work(afs_wq, &cell->manager); +} + +/* + * Set/reduce the cell timer. 
+ */ +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) +{ + timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ); } /* @@ -706,7 +717,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) if (cell->proc_link.next) cell->proc_link.next->pprev = &cell->proc_link.next; - afs_dynroot_mkdir(net, cell); mutex_unlock(&net->proc_cells_lock); return 0; } @@ -723,217 +733,130 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) mutex_lock(&net->proc_cells_lock); if (!hlist_unhashed(&cell->proc_link)) hlist_del_rcu(&cell->proc_link); - afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); _leave(""); } +static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage) +{ + const struct afs_vlserver_list *vllist; + time64_t expire_at = cell->last_inactive; + time64_t now = ktime_get_real_seconds(); + + if (atomic_read(&cell->active)) + return false; + if (!cell->net->live) + return true; + + vllist = rcu_dereference_protected(cell->vl_servers, true); + if (vllist && vllist->nr_servers > 0) + expire_at += afs_cell_gc_delay; + + if (expire_at <= now) + return true; + if (expire_at < *_next_manage) + *_next_manage = expire_at; + return false; +} + /* * Manage a cell record, initialising and destroying it, maintaining its DNS * records. 
*/ -static void afs_manage_cell(struct afs_cell *cell) +static bool afs_manage_cell(struct afs_cell *cell) { struct afs_net *net = cell->net; - int ret, active; + time64_t next_manage = TIME64_MAX; + int ret; _enter("%s", cell->name); -again: _debug("state %u", cell->state); switch (cell->state) { - case AFS_CELL_INACTIVE: - case AFS_CELL_FAILED: - down_write(&net->cells_lock); - active = 1; - if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { - rb_erase(&cell->net_node, &net->cells); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0, - afs_cell_trace_unuse_delete); - smp_store_release(&cell->state, AFS_CELL_REMOVED); - } - up_write(&net->cells_lock); - if (cell->state == AFS_CELL_REMOVED) { - wake_up_var(&cell->state); - goto final_destruction; - } - if (cell->state == AFS_CELL_FAILED) - goto done; - smp_store_release(&cell->state, AFS_CELL_UNSET); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_UNSET: - smp_store_release(&cell->state, AFS_CELL_ACTIVATING); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_ACTIVATING: - ret = afs_activate_cell(net, cell); - if (ret < 0) - goto activation_failed; + case AFS_CELL_SETTING_UP: + goto set_up_cell; + case AFS_CELL_ACTIVE: + goto cell_is_active; + case AFS_CELL_REMOVING: + WARN_ON_ONCE(1); + return false; + case AFS_CELL_DEAD: + return false; + default: + _debug("bad state %u", cell->state); + WARN_ON_ONCE(1); /* Unhandled state */ + return false; + } - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - goto again; +set_up_cell: + ret = afs_activate_cell(net, cell); + if (ret < 0) { + cell->error = ret; + goto remove_cell; + } - case AFS_CELL_ACTIVE: - if (atomic_read(&cell->active) > 1) { - if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { - ret = afs_update_cell(cell); - if (ret < 0) - cell->error = ret; - } - goto done; - } - smp_store_release(&cell->state, AFS_CELL_DEACTIVATING); - wake_up_var(&cell->state); - goto again; + 
afs_set_cell_state(cell, AFS_CELL_ACTIVE); - case AFS_CELL_DEACTIVATING: - if (atomic_read(&cell->active) > 1) - goto reverse_deactivation; - afs_deactivate_cell(net, cell); - smp_store_release(&cell->state, AFS_CELL_INACTIVE); - wake_up_var(&cell->state); - goto again; +cell_is_active: + if (afs_has_cell_expired(cell, &next_manage)) + goto remove_cell; - case AFS_CELL_REMOVED: - goto done; + if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { + ret = afs_update_cell(cell); + if (ret < 0) + cell->error = ret; + } - default: - break; + if (next_manage < TIME64_MAX && cell->net->live) { + time64_t now = ktime_get_real_seconds(); + + if (next_manage - now <= 0) + afs_queue_cell(cell, afs_cell_trace_queue_again); + else + afs_set_cell_timer(cell, next_manage - now); } - _debug("bad state %u", cell->state); - BUG(); /* Unhandled state */ + _leave(" [done %u]", cell->state); + return false; -activation_failed: - cell->error = ret; - afs_deactivate_cell(net, cell); +remove_cell: + down_write(&net->cells_lock); - smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */ - wake_up_var(&cell->state); - goto again; + if (atomic_read(&cell->active)) { + up_write(&net->cells_lock); + goto cell_is_active; + } -reverse_deactivation: - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - _leave(" [deact->act]"); - return; + /* Make sure that the expiring server records are going to see the fact + * that the cell is caput. 
+ */ + afs_set_cell_state(cell, AFS_CELL_REMOVING); -done: - _leave(" [done %u]", cell->state); - return; + afs_deactivate_cell(net, cell); + afs_purge_servers(cell); + + rb_erase(&cell->net_node, &net->cells); + afs_see_cell(cell, afs_cell_trace_unuse_delete); + up_write(&net->cells_lock); -final_destruction: /* The root volume is pinning the cell */ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; - afs_put_cell(cell, afs_cell_trace_put_destroy); + + afs_set_cell_state(cell, AFS_CELL_DEAD); + return true; } static void afs_manage_cell_work(struct work_struct *work) { struct afs_cell *cell = container_of(work, struct afs_cell, manager); + bool final_put; - afs_manage_cell(cell); - afs_put_cell(cell, afs_cell_trace_put_queue_work); -} - -/* - * Manage the records of cells known to a network namespace. This includes - * updating the DNS records and garbage collecting unused cells that were - * automatically added. - * - * Note that constructed cell records may only be removed from net->cells by - * this work item, so it is safe for this work item to stash a cursor pointing - * into the tree and then return to caller (provided it skips cells that are - * still under construction). - * - * Note also that we were given an increment on net->cells_outstanding by - * whoever queued us that we need to deal with before returning. - */ -void afs_manage_cells(struct work_struct *work) -{ - struct afs_net *net = container_of(work, struct afs_net, cells_manager); - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; - - _enter(""); - - /* Trawl the cell database looking for cells that have expired from - * lack of use and cells whose DNS results have expired and dispatch - * their managers. 
- */ - down_read(&net->cells_lock); - - for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { - struct afs_cell *cell = - rb_entry(cursor, struct afs_cell, net_node); - unsigned active; - bool sched_cell = false; - - active = atomic_read(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_manage); - - ASSERTCMP(active, >=, 1); - - if (purging) { - if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) { - active = atomic_dec_return(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_unuse_pin); - } - } - - if (active == 1) { - struct afs_vlserver_list *vllist; - time64_t expire_at = cell->last_inactive; - - read_lock(&cell->vl_servers_lock); - vllist = rcu_dereference_protected( - cell->vl_servers, - lockdep_is_held(&cell->vl_servers_lock)); - if (vllist->nr_servers > 0) - expire_at += afs_cell_gc_delay; - read_unlock(&cell->vl_servers_lock); - if (purging || expire_at <= now) - sched_cell = true; - else if (expire_at < next_manage) - next_manage = expire_at; - } - - if (!purging) { - if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) - sched_cell = true; - } - - if (sched_cell) - afs_queue_cell(cell, afs_cell_trace_get_queue_manage); - } - - up_read(&net->cells_lock); - - /* Update the timer on the way out. We have to pass an increment on - * cells_outstanding in the namespace that we are in to the timer or - * the work scheduler. 
- */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); - - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->cells_manager)) - atomic_inc(&net->cells_outstanding); - } else { - afs_set_cell_timer(net, next_manage - now); - } - } - - afs_dec_cells_outstanding(net); - _leave(" [%d]", atomic_read(&net->cells_outstanding)); + afs_see_cell(cell, afs_cell_trace_manage); + final_put = afs_manage_cell(cell); + afs_see_cell(cell, afs_cell_trace_managed); + if (final_put) + afs_put_cell(cell, afs_cell_trace_put_final); } /* @@ -942,6 +865,7 @@ void afs_manage_cells(struct work_struct *work) void afs_cell_purge(struct afs_net *net) { struct afs_cell *ws; + struct rb_node *cursor; _enter(""); @@ -949,14 +873,21 @@ void afs_cell_purge(struct afs_net *net) ws = rcu_replace_pointer(net->ws_cell, NULL, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); - afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws); + afs_unuse_cell(ws, afs_cell_trace_unuse_ws); - _debug("del timer"); - if (del_timer_sync(&net->cells_timer)) - atomic_dec(&net->cells_outstanding); + _debug("kick cells"); + down_read(&net->cells_lock); + for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { + struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); + + afs_see_cell(cell, afs_cell_trace_purge); - _debug("kick mgr"); - afs_queue_cell_manager(net); + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_unuse_cell(cell, afs_cell_trace_unuse_pin); + + afs_queue_cell(cell, afs_cell_trace_queue_purge); + } + up_read(&net->cells_lock); _debug("wait"); wait_var_event(&net->cells_outstanding, diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 99a3f20bc786..1a906805a9e3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * Find the server record by peer address and record a probe to the cache - * manager from a server. 
- */ -static int afs_find_cm_server_by_peer(struct afs_call *call) -{ - struct sockaddr_rxrpc srx; - struct afs_server *server; - struct rxrpc_peer *peer; - - peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall); - - server = afs_find_server(call->net, peer); - if (!server) { - trace_afs_cm_no_server(call, &srx); - return 0; - } - - call->server = server; - return 0; -} - -/* - * Find the server record by server UUID and record a probe to the cache - * manager from a server. - */ -static int afs_find_cm_server_by_uuid(struct afs_call *call, - struct afs_uuid *uuid) -{ - struct afs_server *server; - - rcu_read_lock(); - server = afs_find_server_by_uuid(call->net, call->request); - rcu_read_unlock(); - if (!server) { - trace_afs_cm_no_server_u(call, call->request); - return 0; - } - - call->server = server; - return 0; -} - -/* * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) @@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) */ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - int ret; - _enter(""); afs_extract_discard(call, 0); - ret = afs_extract_data(call, false); - if (ret < 0) - return ret; - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return afs_extract_data(call, false); } /* @@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) __be32 *b; int ret; - _enter(""); - _enter("{%u}", call->unmarshall); switch (call->unmarshall) { @@ -421,9 +365,13 @@ static int 
afs_deliver_cb_init_call_back_state3(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_uuid(call, call->request); + if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) { + pr_notice("Callback UUID does not match fileserver UUID\n"); + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + + return 0; } /* @@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* We'll need the file server record as that tells us which set of - * vnodes to operate upon. 
- */ - return afs_find_cm_server_by_peer(call); + return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 02cbf38e1a77..9e7b1fe82c27 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode); +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, @@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, afs_stat_v(dvnode, n_lookup); inode = afs_do_lookup(dir, dentry); if (inode == ERR_PTR(-ENOENT)) - inode = afs_try_auto_mntpt(dentry, dir); - - if (!IS_ERR_OR_NULL(inode)) + inode = NULL; + else if (!IS_ERR_OR_NULL(inode)) fid = AFS_FS_I(inode)->fid; _debug("splice %p", dentry->d_inode); @@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op = afs_alloc_operation(NULL, dvnode->volume); if (IS_ERR(op)) { d_drop(dentry); - return PTR_ERR(op); + return ERR_CAST(op); } fscache_use_cookie(afs_vnode_cache(dvnode), true); @@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct 
inode *dir, op->ops = &afs_mkdir_operation; ret = afs_do_sync_operation(op); afs_dir_unuse_cookie(dvnode, ret); - return ret; + return ERR_PTR(ret); } /* diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 7d997f7a8028..691e0ae607a1 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,16 +10,19 @@ #include <linux/dns_resolver.h> #include "internal.h" -static atomic_t afs_autocell_ino; +#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */ +#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX) + +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino); /* * iget5() comparator for inode created by autocell operations - * - * These pseudo inodes don't match anything. */ static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) { - return 0; + struct afs_fid *fid = opaque; + + return inode->i_ino == fid->vnode; } /* @@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) } /* - * Create an inode for a dynamic root directory or an autocell dynamic - * automount dir. + * Create an inode for an autocell dynamic automount dir. 
*/ -struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) +static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) { - struct afs_super_info *as = AFS_FS_S(sb); struct afs_vnode *vnode; struct inode *inode; - struct afs_fid fid = {}; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; _enter(""); - if (as->volume) - fid.vid = as->volume->vid; - if (root) { - fid.vnode = 1; - fid.unique = 1; - } else { - fid.vnode = atomic_inc_return(&afs_autocell_ino); - fid.unique = 0; - } - inode = iget5_locked(sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) { @@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - BUG_ON(!(inode->i_state & I_NEW)); - - netfs_inode_init(&vnode->netfs, NULL, false); - inode->i_size = 0; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (root) { - inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &simple_dir_operations; - } else { - inode->i_op = &afs_autocell_inode_operations; - } - set_nlink(inode, 2); - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - simple_inode_init_ts(inode); - inode->i_blocks = 0; - inode->i_generation = 0; - - set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - if (!root) { + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_autocell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - inode->i_flags |= S_AUTOMOUNT; - } - inode->i_flags |= S_NOATIME; - unlock_new_inode(inode); + unlock_new_inode(inode); + } _leave(" = %p", inode); return inode; } 
/* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * Try to automount the mountpoint with pseudo directory, if the autocell + * option is set. */ -static int afs_probe_cell_name(struct dentry *dentry) +static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct afs_cell *cell; + struct afs_cell *cell = NULL; struct afs_net *net = afs_d2net(dentry); + struct inode *inode = NULL; const char *name = dentry->d_name.name; size_t len = dentry->d_name.len; - char *result = NULL; - int ret; + bool dotted = false; + int ret = -ENOENT; /* Names prefixed with a dot are R/W mounts. */ if (name[0] == '.') { - if (len == 1) - return -EINVAL; name++; len--; + dotted = true; } - cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe); - if (!IS_ERR(cell)) { - afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe); - return 0; - } - - ret = dns_query(net->net, "afsdb", name, len, "srv=1", - &result, NULL, false); - if (ret == -ENODATA || ret == -ENOKEY || ret == 0) - ret = -ENOENT; - if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) { - struct dns_server_list_v1_header *v1 = (void *)result; - - if (v1->hdr.zero == 0 && - v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST && - v1->hdr.version == 1 && - (v1->status != DNS_LOOKUP_GOOD && - v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) - return -ENOENT; - + cell = afs_lookup_cell(net, name, len, NULL, false, + afs_cell_trace_use_lookup_dynroot); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto out_no_cell; } - kfree(result); - return ret; -} - -/* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. 
- */ -struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) -{ - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; - - _enter("%p{%pd}, {%llx:%llu}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); - - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) - goto out; - - ret = afs_probe_cell_name(dentry); - if (ret < 0) - goto out; - - inode = afs_iget_pseudo_dir(dir->i_sb, false); + inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out; } - _leave("= %p", inode); - return inode; + dentry->d_fsdata = cell; + return d_splice_alias(inode, dentry); out: - _leave("= %d", ret); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot); +out_no_cell: + if (!inode) + return d_splice_alias(inode, dentry); return ret == -ENOENT ? NULL : ERR_PTR(ret); } @@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr { _enter("%pd", dentry); - ASSERTCMP(d_inode(dentry), ==, NULL); - if (flags & LOOKUP_CREATE) return ERR_PTR(-EOPNOTSUPP); @@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr return ERR_PTR(-ENAMETOOLONG); } - return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); + if (dentry->d_name.len == 5 && + memcmp(dentry->d_name.name, "@cell", 5) == 0) + return afs_lookup_atcell(dir, dentry, 2); + + if (dentry->d_name.len == 6 && + memcmp(dentry->d_name.name, ".@cell", 6) == 0) + return afs_lookup_atcell(dir, dentry, 3); + + return afs_dynroot_lookup_cell(dir, dentry, flags); } const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -const struct dentry_operations afs_dynroot_dentry_operations = { - .d_delete = always_delete_dentry, - .d_release = afs_d_release, - .d_automount = afs_d_automount, -}; - -/* - * Create a manually added cell mount directory. 
- * - The caller must hold net->proc_cells_lock - */ -int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) -{ - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir, *dsubdir; - char *dotname = cell->name - 1; - int ret; - - if (!sb || atomic_read(&sb->s_active) == 0) - return 0; - - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - subdir = lookup_one_len(cell->name, root, cell->name_len); - if (IS_ERR(subdir)) { - ret = PTR_ERR(subdir); - goto unlock; - } - - dsubdir = lookup_one_len(dotname, root, cell->name_len + 1); - if (IS_ERR(dsubdir)) { - ret = PTR_ERR(dsubdir); - dput(subdir); - goto unlock; - } - - /* Note that we're retaining extra refs on the dentries. */ - subdir->d_fsdata = (void *)1UL; - dsubdir->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; -} - -static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len) +static void afs_dynroot_d_release(struct dentry *dentry) { - struct dentry *subdir; - - /* Don't want to trigger a lookup call, which will re-add the cell */ - subdir = try_lookup_one_len(name, root, name_len); - if (IS_ERR_OR_NULL(subdir)) { - _debug("lookup %ld", PTR_ERR(subdir)); - return; - } + struct afs_cell *cell = dentry->d_fsdata; - _debug("rmdir %pd %u", subdir, d_count(subdir)); - - if (subdir->d_fsdata) { - _debug("unpin %u", d_count(subdir)); - subdir->d_fsdata = NULL; - dput(subdir); - } - dput(subdir); + afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt); } /* - * Remove a manually added cell mount directory. - * - The caller must hold net->proc_cells_lock + * Keep @cell symlink dentries around, but only keep cell autodirs when they're + * being used. 
*/ -void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +static int afs_dynroot_delete_dentry(const struct dentry *dentry) { - struct super_block *sb = net->dynroot_sb; - char *dotname = cell->name - 1; - - if (!sb || atomic_read(&sb->s_active) == 0) - return; + const struct qstr *name = &dentry->d_name; - inode_lock(sb->s_root->d_inode); - afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len); - afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1); - inode_unlock(sb->s_root->d_inode); - _leave(""); + if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) + return 0; + if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) + return 0; + return 1; } +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_delete = afs_dynroot_delete_dentry, + .d_release = afs_dynroot_d_release, + .d_automount = afs_d_automount, +}; + static void afs_atcell_delayed_put_cell(void *arg) { struct afs_cell *cell = arg; @@ -347,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = { }; /* - * Look up @cell or .@cell in a dynroot directory. This is a substitution for - * the local cell name for the net namespace. + * Create an inode for the @cell or .@cell symlinks. 
*/ -static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name) +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) { struct afs_vnode *vnode; - struct afs_fid fid = { .vnode = 2, .unique = 1, }; - struct dentry *dentry; struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; - if (name[0] == '.') - fid.vnode = 3; - - dentry = d_alloc_name(root, name); - if (!dentry) - return ERR_PTR(-ENOMEM); - - inode = iget5_locked(dentry->d_sb, fid.vnode, + inode = iget5_locked(dir->i_sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); - if (!inode) { - dput(dentry); + if (!inode) return ERR_PTR(-ENOMEM); - } vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) { - iput(inode); - dput(dentry); - return ERR_PTR(-EIO); + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); } - - netfs_inode_init(&vnode->netfs, NULL, false); - simple_inode_init_ts(inode); - set_nlink(inode, 1); - inode->i_size = 0; - inode->i_mode = S_IFLNK | 0555; - inode->i_op = &afs_atcell_inode_operations; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_blocks = 0; - inode->i_generation = 0; - inode->i_flags |= S_NOATIME; - - unlock_new_inode(inode); - d_splice_alias(inode, dentry); - return dentry; + return d_splice_alias(inode, dentry); } /* - * Create @cell and .@cell symlinks. + * Transcribe the cell database into readdir content under the RCU read lock. + * Each cell produces two entries, one prefixed with a dot and one not. 
*/ -static int afs_dynroot_symlink(struct afs_net *net) +static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *symlink, *dsymlink; - int ret; - - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - symlink = afs_dynroot_create_symlink(root, "@cell"); - if (IS_ERR(symlink)) { - ret = PTR_ERR(symlink); - goto unlock; - } + const struct afs_cell *cell; + loff_t newpos; + + _enter("%llu", ctx->pos); + + for (;;) { + unsigned int ix = ctx->pos >> 1; + + cell = idr_get_next(&net->cells_dyn_ino, &ix); + if (!cell) + return 0; + if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || + READ_ONCE(cell->state) == AFS_CELL_DEAD) { + ctx->pos += 2; + ctx->pos &= ~1; + continue; + } - dsymlink = afs_dynroot_create_symlink(root, ".@cell"); - if (IS_ERR(dsymlink)) { - ret = PTR_ERR(dsymlink); - dput(symlink); - goto unlock; - } + newpos = ix << 1; + if (newpos > ctx->pos) + ctx->pos = newpos; - /* Note that we're retaining extra refs on the dentries. */ - symlink->d_fsdata = (void *)1UL; - dsymlink->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; + _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); + + if ((ctx->pos & 1) == 0) { + if (!dir_emit(ctx, cell->name, cell->name_len, + cell->dynroot_ino, DT_DIR)) + return 0; + ctx->pos++; + } + if ((ctx->pos & 1) == 1) { + if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, + cell->dynroot_ino + 1, DT_DIR)) + return 0; + ctx->pos++; + } + } + return 0; } /* - * Populate a newly created dynamic root with cell names. + * Read the AFS dynamic root directory. This produces a list of cellnames, + * dotted and undotted, along with @cell and .@cell links if configured. 
*/ -int afs_dynroot_populate(struct super_block *sb) +static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) { - struct afs_cell *cell; - struct afs_net *net = afs_sb2net(sb); - int ret; - - mutex_lock(&net->proc_cells_lock); + struct afs_net *net = afs_d2net(file->f_path.dentry); + int ret = 0; - net->dynroot_sb = sb; - ret = afs_dynroot_symlink(net); - if (ret < 0) - goto error; + if (!dir_emit_dots(file, ctx)) + return 0; - hlist_for_each_entry(cell, &net->proc_cells, proc_link) { - ret = afs_dynroot_mkdir(net, cell); - if (ret < 0) - goto error; + if (ctx->pos == 2) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) + return 0; + ctx->pos = 3; + } + if (ctx->pos == 3) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) + return 0; + ctx->pos = 4; } - ret = 0; -out: - mutex_unlock(&net->proc_cells_lock); + if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { + rcu_read_lock(); + ret = afs_dynroot_readdir_cells(net, ctx); + rcu_read_unlock(); + } return ret; - -error: - net->dynroot_sb = NULL; - goto out; } +static const struct file_operations afs_dynroot_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = afs_dynroot_readdir, + .fsync = noop_fsync, +}; + /* - * When a dynamic root that's in the process of being destroyed, depopulate it - * of pinned directories. + * Create an inode for a dynamic root directory. 
*/ -void afs_dynroot_depopulate(struct super_block *sb) +struct inode *afs_dynroot_iget_root(struct super_block *sb) { - struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir; - - /* Prevent more subdirs from being created */ - mutex_lock(&net->proc_cells_lock); - if (net->dynroot_sb == sb) - net->dynroot_sb = NULL; - mutex_unlock(&net->proc_cells_lock); - - if (root) { - struct hlist_node *n; - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); - } - } + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; + + if (as->volume) + fid.vid = as->volume->vid; - inode_unlock(root->d_inode); + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &afs_dynroot_file_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + unlock_new_inode(inode); } + _leave(" = %p", inode); + return inode; } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index b516d05b0fef..07a8bfbdd9b9 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -235,20 +235,20 @@ out: * Probe all of a fileserver's addresses to find out the best route and to * query its capabilities. 
*/ -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_alist, struct key *key) +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key) { struct afs_endpoint_state *estate, *old; - struct afs_addr_list *alist; + struct afs_addr_list *old_alist = NULL, *alist; unsigned long unprobed; _enter("%pU", &server->uuid); estate = kzalloc(sizeof(*estate), GFP_KERNEL); if (!estate) - return; + return -ENOMEM; - refcount_set(&estate->ref, 1); + refcount_set(&estate->ref, 2); estate->server_id = server->debug_id; estate->rtt = UINT_MAX; @@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, old = rcu_dereference_protected(server->endpoint_state, lockdep_is_held(&server->fs_lock)); - estate->responsive_set = old->responsive_set; - estate->addresses = afs_get_addrlist(new_alist ?: old->addresses, - afs_alist_trace_get_estate); + if (old) { + estate->responsive_set = old->responsive_set; + if (!new_alist) + new_alist = old->addresses; + } + + if (old_alist != new_alist) + afs_set_peer_appdata(server, old_alist, new_alist); + + estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate); alist = estate->addresses; estate->probe_seq = ++server->probe_counter; atomic_set(&estate->nr_probing, alist->nr_addrs); + if (new_alist) + server->addr_version = new_alist->version; rcu_assign_pointer(server->endpoint_state, estate); - set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); write_unlock(&server->fs_lock); + if (old) + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), afs_estate_trace_alloc_probe); - afs_get_address_preferences(net, alist); + afs_get_address_preferences(net, new_alist); server->probed_at = jiffies; unprobed = (1UL << alist->nr_addrs) - 1; @@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct 
afs_server *server, } afs_put_endpoint_state(old, afs_estate_trace_put_probe); + afs_put_endpoint_state(estate, afs_estate_trace_put_probe); + return 0; } /* diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d9ecd5418d8..bc9556991d7c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); - call->server = afs_use_server(server, afs_server_trace_give_up_cb); + call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb); afs_make_call(call, GFP_NOFS); afs_wait_for_call_to_complete(call); ret = call->error; @@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, return false; call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); + call->server = afs_use_server(server, false, afs_server_trace_use_get_caps); call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); call->probe_index = addr_index; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df30bd62da79..440b0e731093 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -287,9 +287,8 @@ struct afs_net { /* Cell database */ struct rb_root cells; + struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */ struct afs_cell __rcu *ws_cell; - struct work_struct cells_manager; - struct timer_list cells_timer; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; @@ -301,18 +300,11 @@ struct afs_net { * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. 
*/ - seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ - struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + seqlock_t fs_lock; /* For fs_probe_*, fs_proc */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ - seqlock_t fs_addr_lock; /* For fs_addresses[46] */ - - struct work_struct fs_manager; - struct timer_list fs_timer; - struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -345,13 +337,10 @@ struct afs_net { extern const char afs_init_sysname[]; enum afs_cell_state { - AFS_CELL_UNSET, - AFS_CELL_ACTIVATING, + AFS_CELL_SETTING_UP, AFS_CELL_ACTIVE, - AFS_CELL_DEACTIVATING, - AFS_CELL_INACTIVE, - AFS_CELL_FAILED, - AFS_CELL_REMOVED, + AFS_CELL_REMOVING, + AFS_CELL_DEAD, }; /* @@ -382,7 +371,9 @@ struct afs_cell { struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct destroyer; /* Destroyer for cell */ struct work_struct manager; /* Manager for init/deinit/dns */ + struct timer_list management_timer; /* General management timer */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ @@ -398,6 +389,7 @@ struct afs_cell { enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; + unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */ /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ @@ -407,7 +399,7 @@ 
struct afs_cell { /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ - seqlock_t fs_lock; /* For fs_servers */ + struct rw_semaphore fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -542,22 +534,22 @@ struct afs_server { }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct rb_node uuid_rb; /* Link in net->fs_servers */ - struct afs_server __rcu *uuid_next; /* Next server with same UUID */ - struct afs_server *uuid_prev; /* Previous server with same UUID */ - struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr_link; /* Link in net->fs_addresses6 */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_* */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ - struct afs_server *gc_next; /* Next server in manager's list */ + struct work_struct destroyer; /* Work item to try and destroy a server */ + struct timer_list timer; /* Management timer */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ -#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ -#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ -#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ +#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */ +#define AFS_SERVER_FL_CREATING 4 /* The record is being created */ +#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */ +#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May 
have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ @@ -567,6 +559,7 @@ struct afs_server { atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. */ + short create_error; /* Creation error */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ @@ -621,6 +614,7 @@ struct afs_volume { afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; + unsigned int debug_id; /* Debugging ID for traces */ time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ @@ -700,7 +694,6 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ @@ -1008,6 +1001,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist); /* * addr_prefs.c @@ -1044,16 +1040,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell 
*afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); -extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, - const char *, bool); +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, bool excl, + enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_manage_cells(struct work_struct *); -extern void afs_cells_timer(struct timer_list *); +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs); extern void __net_exit afs_cell_purge(struct afs_net *); /* @@ -1111,11 +1108,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *); extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; -extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); -extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); -extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); -extern int afs_dynroot_populate(struct super_block *); -extern void afs_dynroot_depopulate(struct super_block *); +struct inode *afs_dynroot_iget_root(struct super_block *sb); /* * file.c @@ -1207,8 +1200,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call 
*); -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_addrs, struct key *key); +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); @@ -1228,7 +1221,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); -extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, @@ -1510,20 +1502,30 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); -extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +struct afs_server *afs_find_server(const struct rxrpc_peer *peer); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); -extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); +struct afs_server *afs_use_server(struct afs_server 
*server, bool activate, + enum afs_server_trace reason); +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_manage_servers(struct work_struct *); -extern void afs_servers_timer(struct timer_list *); +void afs_purge_servers(struct afs_cell *cell); extern void afs_fs_probe_timer(struct timer_list *); -extern void __net_exit afs_purge_servers(struct afs_net *); +void __net_exit afs_wait_for_servers(struct afs_net *net); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); +static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) +{ + int r = refcount_read(&server->ref); + int a = atomic_read(&server->active); + + trace_afs_server(server->debug_id, r, a, trace); + +} + static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); diff --git a/fs/afs/main.c b/fs/afs/main.c index 1ae0067f772d..c845c5daaeba 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -76,25 +76,17 @@ static int __net_init afs_net_init(struct net *net_ns) mutex_init(&net->socket_mutex); net->cells = RB_ROOT; + idr_init(&net->cells_dyn_ino); init_rwsem(&net->cells_lock); - INIT_WORK(&net->cells_manager, afs_manage_cells); - timer_setup(&net->cells_timer, afs_cells_timer, 0); - mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); - net->fs_servers = RB_ROOT; INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses); - seqlock_init(&net->fs_addr_lock); - - INIT_WORK(&net->fs_manager, afs_manage_servers); - timer_setup(&net->fs_timer, 
afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); @@ -130,13 +122,14 @@ error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: + idr_destroy(&net->cells_dyn_ino); net->live = false; return ret; } @@ -151,10 +144,11 @@ static void __net_exit afs_net_exit(struct net *net_ns) net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + idr_destroy(&net->cells_dyn_ino); kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 507c25a5b2cb..45cee6534122 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); return PTR_ERR(cell); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 12c88d8be3fe..40e879c8ca77 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) 
{ struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true); + cell = afs_lookup_cell(net, name, strlen(name), args, true, + afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); + afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } @@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) } server = list_entry(v, struct afs_server, proc_link); - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), @@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) server->flags, server->rtt); seq_printf(m, " - probe: last=%d\n", (int)(jiffies - server->probed_at) / HZ); + + estate = rcu_dereference(server->endpoint_state); + if (!estate) + goto out; failed = estate->failed_set; seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", estate->probe_seq, atomic_read(&estate->nr_probing), estate->responsive_set, estate->failed_set); + + alist = estate->addresses; seq_printf(m, " - ALIST v=%u ap=%u\n", alist->version, alist->addr_pref_version); for (i = 0; i < alist->nr_addrs; i++) { @@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) rxrpc_kernel_get_srtt(addr->peer), addr->last_error, addr->prio); } + +out: return 0; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 886416ea1d96..d5e480a33859 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -179,7 +179,7 @@ static void afs_free_call(struct afs_call *call) if (call->type->destructor) call->type->destructor(call); - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); + afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); kfree(call->request); o = atomic_read(&net->nr_outstanding_calls); @@ -766,8 
+766,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { + struct afs_call *call = (struct afs_call *)user_call_ID; struct afs_net *net = afs_sock2net(sk); + call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); + call->server = afs_find_server(call->peer); + if (!call->server) + trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); + queue_work(afs_wq, &net->charge_preallocation_work); } diff --git a/fs/afs/server.c b/fs/afs/server.c index 4504e16b458c..c530d1ca15df 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -14,190 +14,104 @@ static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static atomic_t afs_server_debug_id; -static struct afs_server *afs_maybe_use_server(struct afs_server *, - enum afs_server_trace); static void __afs_put_server(struct afs_net *, struct afs_server *); +static void afs_server_timer(struct timer_list *timer); +static void afs_server_destroyer(struct work_struct *work); /* * Find a server by one of its addresses. 
*/ -struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) +struct afs_server *afs_find_server(const struct rxrpc_peer *peer) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server = NULL; - unsigned int i; - int seq = 1; + struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer); - rcu_read_lock(); - - do { - if (server) - afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - - hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; - for (i = 0; i < alist->nr_addrs; i++) - if (alist->addrs[i].peer == peer) - goto found; - } - - server = NULL; - continue; - found: - server = afs_maybe_use_server(server, afs_server_trace_get_by_addr); - - } while (need_seqretry(&net->fs_addr_lock, seq)); - - done_seqretry(&net->fs_addr_lock, seq); - - rcu_read_unlock(); - return server; + if (!server) + return NULL; + return afs_use_server(server, false, afs_server_trace_use_cm_call); } /* - * Look up a server by its UUID and mark it active. + * Look up a server by its UUID and mark it active. The caller must hold + * cell->fs_lock. */ -struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) +static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_server *server = NULL; + struct afs_server *server; struct rb_node *p; - int diff, seq = 1; + int diff; _enter("%pU", uuid); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. 
- */ - if (server) - afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_lock, &seq); - - p = net->fs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, uuid_rb); - - diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); - if (diff < 0) { - p = p->rb_left; - } else if (diff > 0) { - p = p->rb_right; - } else { - afs_use_server(server, afs_server_trace_get_by_uuid); - break; - } - - server = NULL; - } - } while (need_seqretry(&net->fs_lock, seq)); + p = cell->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); - done_seqretry(&net->fs_lock, seq); + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) + return NULL; /* Need a write lock */ + afs_use_server(server, true, afs_server_trace_use_by_uuid); + return server; + } + } - _leave(" = %p", server); - return server; + return NULL; } /* - * Install a server record in the namespace tree. If there's a clash, we stick - * it into a list anchored on whichever afs_server struct is actually in the - * tree. + * Install a server record in the cell tree. The caller must hold an exclusive + * lock on cell->fs_lock. 
*/ static struct afs_server *afs_install_server(struct afs_cell *cell, - struct afs_server *candidate) + struct afs_server **candidate) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server, *next; + struct afs_server *server; struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; _enter("%p", candidate); - write_seqlock(&net->fs_lock); - /* Firstly install the server in the UUID lookup tree */ - pp = &net->fs_servers.rb_node; + pp = &cell->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); server = rb_entry(p, struct afs_server, uuid_rb); - diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); - if (diff < 0) { + diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - } else if (diff > 0) { + else if (diff > 0) pp = &(*pp)->rb_right; - } else { - if (server->cell == cell) - goto exists; - - /* We have the same UUID representing servers in - * different cells. Append the new server to the list. 
- */ - for (;;) { - next = rcu_dereference_protected( - server->uuid_next, - lockdep_is_held(&net->fs_lock.lock)); - if (!next) - break; - server = next; - } - rcu_assign_pointer(server->uuid_next, candidate); - candidate->uuid_prev = server; - server = candidate; - goto added_dup; - } + else + goto exists; } - server = candidate; + server = *candidate; + *candidate = NULL; rb_link_node(&server->uuid_rb, p, pp); - rb_insert_color(&server->uuid_rb, &net->fs_servers); + rb_insert_color(&server->uuid_rb, &cell->fs_servers); + write_seqlock(&net->fs_lock); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + write_sequnlock(&net->fs_lock); afs_get_cell(cell, afs_cell_trace_get_server); -added_dup: - write_seqlock(&net->fs_addr_lock); - estate = rcu_dereference_protected(server->endpoint_state, - lockdep_is_held(&net->fs_addr_lock.lock)); - alist = estate->addresses; - - /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install - * it in the IPv4 and/or IPv6 reverse-map lists. - * - * TODO: For speed we want to use something other than a flat list - * here; even sorting the list in terms of lowest address would help a - * bit, but anything we might want to do gets messy and memory - * intensive. - */ - if (alist->nr_addrs > 0) - hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); - - write_sequnlock(&net->fs_addr_lock); - exists: - afs_get_server(server, afs_server_trace_get_install); - write_sequnlock(&net->fs_lock); + afs_use_server(server, true, afs_server_trace_use_install); return server; } /* - * Allocate a new server record and mark it active. + * Allocate a new server record and mark it as active but uncreated. 
*/ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const uuid_t *uuid, - struct afs_addr_list *alist) +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_endpoint_state *estate; struct afs_server *server; struct afs_net *net = cell->net; @@ -205,65 +119,49 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); if (!server) - goto enomem; - - estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL); - if (!estate) - goto enomem_server; + return NULL; refcount_set(&server->ref, 1); - atomic_set(&server->active, 1); + atomic_set(&server->active, 0); + __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); server->debug_id = atomic_inc_return(&afs_server_debug_id); - server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); + INIT_WORK(&server->destroyer, &afs_server_destroyer); + timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); + INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; server->service_id = FS_SERVICE; - server->probe_counter = 1; server->probed_at = jiffies - LONG_MAX / 2; - refcount_set(&estate->ref, 1); - estate->addresses = alist; - estate->server_id = server->debug_id; - estate->probe_seq = 1; - rcu_assign_pointer(server->endpoint_state, estate); afs_inc_servers_outstanding(net); - trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); - trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), - afs_estate_trace_alloc_server); _leave(" = %p", server); return server; - -enomem_server: - kfree(server); -enomem: - _leave(" = NULL [nomem]"); - return NULL; } /* * Look up an address record for a server */ -static struct afs_addr_list *afs_vl_lookup_addrs(struct 
afs_cell *cell, - struct key *key, const uuid_t *uuid) +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server, + struct key *key) { struct afs_vl_cursor vc; struct afs_addr_list *alist = NULL; int ret; ret = -ERESTARTSYS; - if (afs_begin_vlserver_operation(&vc, cell, key)) { + if (afs_begin_vlserver_operation(&vc, server->cell, key)) { while (afs_select_vlserver(&vc)) { if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) - alist = afs_yfsvl_get_endpoints(&vc, uuid); + alist = afs_yfsvl_get_endpoints(&vc, &server->uuid); else - alist = afs_vl_get_addrs_u(&vc, uuid); + alist = afs_vl_get_addrs_u(&vc, &server->uuid); } ret = afs_end_vlserver_operation(&vc); @@ -273,72 +171,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, } /* - * Get or create a fileserver record. + * Get or create a fileserver record and return it with an active-use count on + * it. */ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, const uuid_t *uuid, u32 addr_version) { - struct afs_addr_list *alist; - struct afs_server *server, *candidate; + struct afs_addr_list *alist = NULL; + struct afs_server *server, *candidate = NULL; + bool creating = false; + int ret; _enter("%p,%pU", cell->net, uuid); - server = afs_find_server_by_uuid(cell->net, uuid); + down_read(&cell->fs_lock); + server = afs_find_server_by_uuid(cell, uuid); + /* Won't see servers marked uncreated. 
*/ + up_read(&cell->fs_lock); + if (server) { + timer_delete_sync(&server->timer); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) + goto wait_for_creation; if (server->addr_version != addr_version) set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); return server; } - alist = afs_vl_lookup_addrs(cell, key, uuid); - if (IS_ERR(alist)) - return ERR_CAST(alist); - - candidate = afs_alloc_server(cell, uuid, alist); + candidate = afs_alloc_server(cell, uuid); if (!candidate) { afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } - server = afs_install_server(cell, candidate); - if (server != candidate) { - afs_put_addrlist(alist, afs_alist_trace_put_server_dup); + down_write(&cell->fs_lock); + server = afs_install_server(cell, &candidate); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) { + /* We need to wait for creation to complete. */ + up_write(&cell->fs_lock); + goto wait_for_creation; + } + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + set_bit(AFS_SERVER_FL_CREATING, &server->flags); + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; + } + up_write(&cell->fs_lock); + timer_delete_sync(&server->timer); + + /* If we get to create the server, we look up the addresses and then + * immediately dispatch an asynchronous probe to each interface on the + * fileserver. This will make sure the repeat-probing service is + * started. + */ + if (creating) { + alist = afs_vl_lookup_addrs(server, key); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto create_failed; + } + + ret = afs_fs_probe_fileserver(cell->net, server, alist, key); + if (ret) + goto create_failed; + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + } + +out: + afs_put_addrlist(alist, afs_alist_trace_put_server_create); + if (candidate) { + kfree(rcu_access_pointer(server->endpoint_state)); kfree(candidate); - } else { - /* Immediately dispatch an asynchronous probe to each interface - * on the fileserver. 
This will make sure the repeat-probing - * service is started. - */ - afs_fs_probe_fileserver(cell->net, server, alist, key); + afs_dec_servers_outstanding(cell->net); + } + return server ?: ERR_PTR(ret); + +wait_for_creation: + afs_see_server(server, afs_server_trace_wait_create); + wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE); + if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) { + /* Barrier: read flag before error */ + ret = READ_ONCE(server->create_error); + afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + goto out; } - return server; -} + ret = 0; + goto out; -/* - * Set the server timer to fire after a given delay, assuming it's not already - * set for an earlier time. - */ -static void afs_set_server_timer(struct afs_net *net, time64_t delay) -{ - if (net->live) { - afs_inc_servers_outstanding(net); - if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) - afs_dec_servers_outstanding(net); +create_failed: + down_write(&cell->fs_lock); + + WRITE_ONCE(server->create_error, ret); + smp_wmb(); /* Barrier: set error before flag. */ + set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; } + afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + + up_write(&cell->fs_lock); + goto out; } /* - * Server management timer. We have an increment on fs_outstanding that we - * need to pass along to the work item. + * Set/reduce a server's timer. 
*/ -void afs_servers_timer(struct timer_list *timer) +static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs) { - struct afs_net *net = container_of(timer, struct afs_net, fs_timer); - - _enter(""); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + mod_timer(&server->timer, jiffies + delay_secs * HZ); } /* @@ -357,32 +305,20 @@ struct afs_server *afs_get_server(struct afs_server *server, } /* - * Try to get a reference on a server object. + * Get an active count on a server object and maybe remove from the inactive + * list. */ -static struct afs_server *afs_maybe_use_server(struct afs_server *server, - enum afs_server_trace reason) -{ - unsigned int a; - int r; - - if (!__refcount_inc_not_zero(&server->ref, &r)) - return NULL; - - a = atomic_inc_return(&server->active); - trace_afs_server(server->debug_id, r + 1, a, reason); - return server; -} - -/* - * Get an active count on a server object. - */ -struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason) +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason) { unsigned int a; int r; __refcount_inc(&server->ref, &r); a = atomic_inc_return(&server->active); + if (a == 1 && activate && + !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + del_timer(&server->timer); trace_afs_server(server->debug_id, r + 1, a, reason); return server; @@ -415,13 +351,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - unsigned int active = atomic_dec_return(&server->active); + if (!server) + return; - if (active == 0) - afs_set_server_timer(net, afs_server_gc_delay); - afs_put_server(net, server, reason); + if (atomic_dec_and_test(&server->active)) { + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) || + READ_ONCE(server->cell->state) >= 
AFS_CELL_REMOVING) + schedule_work(&server->destroyer); } + + afs_put_server(net, server, reason); } /* @@ -430,10 +369,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, void afs_unuse_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - server->unuse_time = ktime_get_real_seconds(); - afs_unuse_server_notime(net, server, reason); + if (!server) + return; + + if (atomic_dec_and_test(&server->active)) { + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) && + READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) { + time64_t unuse_time = ktime_get_real_seconds(); + + server->unuse_time = unuse_time; + afs_set_server_timer(server, afs_server_gc_delay); + } else { + schedule_work(&server->destroyer); + } } + + afs_put_server(net, server, reason); } static void afs_server_rcu(struct rcu_head *rcu) @@ -463,159 +414,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server } /* - * destroy a dead server + * Check to see if the server record has expired. */ -static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +static bool afs_has_server_expired(const struct afs_server *server) { - if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) - afs_give_up_callbacks(net, server); + time64_t expires_at; - afs_put_server(net, server, afs_server_trace_destroy); + if (atomic_read(&server->active)) + return false; + + if (server->cell->net->live || + server->cell->state >= AFS_CELL_REMOVING) { + trace_afs_server(server->debug_id, refcount_read(&server->ref), + 0, afs_server_trace_purging); + return true; + } + + expires_at = server->unuse_time; + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expires_at += afs_server_gc_delay; + + return ktime_get_real_seconds() > expires_at; } /* - * Garbage collect any expired servers. 
+ * Remove a server record from it's parent cell's database. */ -static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +static bool afs_remove_server_from_cell(struct afs_server *server) { - struct afs_server *server, *next, *prev; - int active; - - while ((server = gc_list)) { - gc_list = server->gc_next; - - write_seqlock(&net->fs_lock); - - active = atomic_read(&server->active); - if (active == 0) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_gc); - next = rcu_dereference_protected( - server->uuid_next, lockdep_is_held(&net->fs_lock.lock)); - prev = server->uuid_prev; - if (!prev) { - /* The one at the front is in the tree */ - if (!next) { - rb_erase(&server->uuid_rb, &net->fs_servers); - } else { - rb_replace_node_rcu(&server->uuid_rb, - &next->uuid_rb, - &net->fs_servers); - next->uuid_prev = NULL; - } - } else { - /* This server is not at the front */ - rcu_assign_pointer(prev->uuid_next, next); - if (next) - next->uuid_prev = prev; - } - - list_del(&server->probe_link); - hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr_link)) - hlist_del_rcu(&server->addr_link); - } - write_sequnlock(&net->fs_lock); + struct afs_cell *cell = server->cell; + + down_write(&cell->fs_lock); - if (active == 0) - afs_destroy_server(net, server); + if (!afs_has_server_expired(server)) { + up_write(&cell->fs_lock); + return false; } + + set_bit(AFS_SERVER_FL_EXPIRED, &server->flags); + _debug("expire %pU %u", &server->uuid, atomic_read(&server->active)); + afs_see_server(server, afs_server_trace_see_expired); + rb_erase(&server->uuid_rb, &cell->fs_servers); + up_write(&cell->fs_lock); + return true; } -/* - * Manage the records of servers known to be within a network namespace. This - * includes garbage collecting unused servers. - * - * Note also that we were given an increment on net->servers_outstanding by - * whoever queued us that we need to deal with before returning. 
- */ -void afs_manage_servers(struct work_struct *work) +static void afs_server_destroyer(struct work_struct *work) { - struct afs_net *net = container_of(work, struct afs_net, fs_manager); - struct afs_server *gc_list = NULL; - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; - - _enter(""); + struct afs_endpoint_state *estate; + struct afs_server *server = container_of(work, struct afs_server, destroyer); + struct afs_net *net = server->cell->net; - /* Trawl the server list looking for servers that have expired from - * lack of use. - */ - read_seqlock_excl(&net->fs_lock); + afs_see_server(server, afs_server_trace_see_destroyer); - for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { - struct afs_server *server = - rb_entry(cursor, struct afs_server, uuid_rb); - int active = atomic_read(&server->active); + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + return; - _debug("manage %pU %u", &server->uuid, active); + if (!afs_remove_server_from_cell(server)) + return; - if (purging) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_purging); - if (active != 0) - pr_notice("Can't purge s=%08x\n", server->debug_id); - } + timer_shutdown_sync(&server->timer); + cancel_work(&server->destroyer); - if (active == 0) { - time64_t expire_at = server->unuse_time; - - if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && - !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) - expire_at += afs_server_gc_delay; - if (purging || expire_at <= now) { - server->gc_next = gc_list; - gc_list = server; - } else if (expire_at < next_manage) { - next_manage = expire_at; - } - } - } + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_give_up_callbacks(net, server); - read_sequnlock_excl(&net->fs_lock); + /* Unbind the rxrpc_peer records from the server. 
*/ + estate = rcu_access_pointer(server->endpoint_state); + if (estate) + afs_set_peer_appdata(server, estate->addresses, NULL); - /* Update the timer on the way out. We have to pass an increment on - * servers_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); + write_seqlock(&net->fs_lock); + list_del_init(&server->probe_link); + if (!hlist_unhashed(&server->proc_link)) + hlist_del_rcu(&server->proc_link); + write_sequnlock(&net->fs_lock); - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->fs_manager)) - afs_inc_servers_outstanding(net); - } else { - afs_set_server_timer(net, next_manage - now); - } - } + afs_put_server(net, server, afs_server_trace_destroy); +} - afs_gc_servers(net, gc_list); +static void afs_server_timer(struct timer_list *timer) +{ + struct afs_server *server = container_of(timer, struct afs_server, timer); - afs_dec_servers_outstanding(net); - _leave(" [%d]", atomic_read(&net->servers_outstanding)); + afs_see_server(server, afs_server_trace_see_timer); + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + schedule_work(&server->destroyer); } -static void afs_queue_server_manager(struct afs_net *net) +/* + * Wake up all the servers in a cell so that they can purge themselves. + */ +void afs_purge_servers(struct afs_cell *cell) { - afs_inc_servers_outstanding(net); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + struct afs_server *server; + struct rb_node *rb; + + down_read(&cell->fs_lock); + for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) { + server = rb_entry(rb, struct afs_server, uuid_rb); + afs_see_server(server, afs_server_trace_see_purge); + schedule_work(&server->destroyer); + } + up_read(&cell->fs_lock); } /* - * Purge list of servers. + * Wait for outstanding servers. 
*/ -void afs_purge_servers(struct afs_net *net) +void afs_wait_for_servers(struct afs_net *net) { _enter(""); - if (del_timer_sync(&net->fs_timer)) - afs_dec_servers_outstanding(net); - - afs_queue_server_manager(net); - - _debug("wait"); atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); @@ -639,7 +550,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op, atomic_read(&server->active), afs_server_trace_update); - alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); + alist = afs_vl_lookup_addrs(server, op->key); if (IS_ERR(alist)) { rcu_read_lock(); estate = rcu_dereference(server->endpoint_state); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index d20cd902ef94..20d5474837df 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, - afs_server_trace_put_slist); + afs_server_trace_unuse_slist); kfree_rcu(slist, rcu); } } @@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_unuse_server(volume->cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server_notime(volume->cell->net, server, + afs_server_trace_unuse_slist_isort); continue; } diff --git a/fs/afs/super.c b/fs/afs/super.c index a9bee610674e..25b306db6992 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) if (as->dyn_root) seq_puts(m, ",dyn"); - if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags)) - seq_puts(m, ",autocell"); switch (as->flock_mode) { case afs_flock_mode_unset: break; case afs_flock_mode_local: p = 
"local"; break; @@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false); + NULL, false, + afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse); afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = NULL; cell = afs_use_cell(ctx->cell->alias_of, afs_cell_trace_use_fc_alias); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { - inode = afs_iget_pseudo_dir(sb, true); + inode = afs_dynroot_iget_root(sb); } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); @@ -478,9 +477,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - if (ctx->autocell || as->dyn_root) - set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) @@ -488,9 +484,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (as->dyn_root) { sb->s_d_op = &afs_dynroot_dentry_operations; - ret = afs_dynroot_populate(sb); - if (ret < 0) - goto error; } else { sb->s_d_op = &afs_fs_dentry_operations; rcu_assign_pointer(as->volume->sb, sb); @@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) static void afs_destroy_sbi(struct afs_super_info *as) 
{ if (as) { - struct afs_net *net = afs_net(as->net_ns); afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); - afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); + afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); - if (as->dyn_root) - afs_dynroot_depopulate(sb); - /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f9e76b604f31..709b4cdb723e 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index d8f79f6ada3d..6ad9688d8f4b 100644 --- 
a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - afs_queue_cell(cell, afs_cell_trace_get_queue_dns); + afs_queue_cell(cell, afs_cell_trace_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index af3a3f57c1b3..0efff3d25133 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,6 +10,7 @@ #include "internal.h" static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static atomic_t afs_volume_debug_id; static void afs_destroy_volume(struct work_struct *work); @@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) struct afs_cell *cell = volume->cell; if (!hlist_unhashed(&volume->proc_link)) { - trace_afs_volume(volume->vid, refcount_read(&cell->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_remove); write_seqlock(&cell->volume_lock); hlist_del_rcu(&volume->proc_link); @@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, if (!volume) goto error_0; + volume->debug_id = atomic_inc_return(&afs_volume_debug_id); volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); @@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, *_slist = slist; rcu_assign_pointer(volume->servers, slist); - trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); + trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc); return volume; error_1: @@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work) afs_remove_volume_from_cell(volume); 
afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); - trace_afs_volume(volume->vid, refcount_read(&volume->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); kfree_rcu(volume, rcu); @@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) int r; if (__refcount_inc_not_zero(&volume->ref, &r)) { - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); return true; } return false; @@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, int r; __refcount_inc(&volume->ref, &r); - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); } return volume; } @@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { + unsigned int debug_id = volume->debug_id; afs_volid_t vid = volume->vid; bool zero; int r; zero = __refcount_dec_and_test(&volume->ref, &r); - trace_afs_volume(vid, r - 1, reason); + trace_afs_volume(debug_id, vid, r - 1, reason); if (zero) schedule_work(&volume->destructor); } diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 77c7991d89aa..23cea74f9933 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *); static inline int autofs_check_pipe(struct file *pipe) { + if (pipe->f_mode & FMODE_PATH) + return -EINVAL; if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 6d57efbb8110..c5a6aae12d2c 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp, sbi->exp_timeout = timeout * HZ; } else { struct 
dentry *base = fp->f_path.dentry; - struct inode *inode = base->d_inode; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; @@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp, "the parent autofs mount timeout which could " "prevent shutdown\n"); - inode_lock_shared(inode); dentry = try_lookup_one_len(param->path, base, path_len); - inode_unlock_shared(inode); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 530d18827e35..174c7205fee4 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, @@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); d_add(dentry, inode); if (sbi->version < 5) @@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct 
mnt_idmap *idmap, inc_nlink(dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - return 0; + return NULL; } /* Get/set timeout ioctl() operation */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 316d88da2ce1..0ef9bcb744dd 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap, return -EIO; } -static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return -EIO; + return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fc7efd0a7525..c9798750202d 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -16,7 +16,7 @@ config BCACHEFS_FS select ZSTD_COMPRESS select ZSTD_DECOMPRESS select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO_CHACHA20 select CRYPTO_POLY1305 select KEYS diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d2689388d5e8..9af65079374f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -41,7 +41,6 @@ bcachefs-y := \ extent_update.o \ eytzinger.o \ fs.o \ - fs-common.o \ fs-ioctl.o \ fs-io.o \ fs-io-buffered.o \ @@ -64,9 +63,11 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + namei.o \ nocow_locking.o \ opts.o \ printbuf.o \ + progress.o \ quota.o \ rebalance.o \ rcu_pending.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3ea809990ef1..5fb396be9127 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v2_unpack_error, + c, alloc_v3_unpack_error, "unpack error"); fsck_err: return ret; @@ -777,14 +777,12 @@ static inline int 
bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, @@ -837,7 +835,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -EIO; + return -BCH_ERR_trigger_alloc; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -871,6 +869,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (data_type_is_empty(new_a->data_type) && BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + if (new_a->oldest_gen == new_a->gen && + !bch2_bucket_sectors_total(*new_a)) + new_a->oldest_gen++; new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); alloc_data_type_set(new_a, new_a->data_type); @@ -889,26 +890,20 @@ int bch2_trigger_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = bch2_current_io_time(c, READ); - u64 old_lru = alloc_lru_idx_read(*old_a); - u64 new_lru = alloc_lru_idx_read(*new_a); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new.k->p.inode, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + alloc_lru_idx_read(*old_a), + alloc_lru_idx_read(*new_a)); + if (ret) + goto err; - old_lru = alloc_lru_idx_fragmentation(*old_a, ca); - new_lru = 
alloc_lru_idx_fragmentation(*new_a, ca); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, + BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(new.k->p), + alloc_lru_idx_fragmentation(*old_a, ca), + alloc_lru_idx_fragmentation(*new_a, ca)); + if (ret) + goto err; if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); @@ -1034,7 +1029,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -EIO; + ret = -BCH_ERR_trigger_alloc; goto err; } @@ -1705,7 +1700,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(alloc_k.k->p), lru_idx, alloc_k, last_flushed); if (ret) goto err; @@ -1735,7 +1731,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ], alloc_k, last_flushed); if (ret) goto err; @@ -1757,7 +1755,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: + bch2_check_stripe_to_lru_refs(c); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); @@ -1805,6 +1804,19 @@ struct discard_buckets_state { u64 discarded; }; +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount 
options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? c->opts.discard + : ca->mi.discard; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1868,7 +1880,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (ca->mi.discard && !c->opts.nochanges) { + if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree @@ -1897,7 +1909,10 @@ commit: if (ret) goto out; - count_event(c, bucket_discard); + if (!fastpath) + count_event(c, bucket_discard); + else + count_event(c, bucket_discard_fast); out: fsck_err: if (discard_locked) @@ -2055,16 +2070,71 @@ put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } +static int invalidate_one_bp(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) +{ + struct btree_iter extent_iter; + struct bkey_s_c extent_k = + bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); + int ret = bkey_err(extent_k); + if (ret) + return ret; + + struct bkey_i *n = + bch2_bkey_make_mut(trans, &extent_iter, &extent_k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); +err: + bch2_trans_iter_exit(trans, &extent_iter); + return ret; +} + +static int invalidate_one_bucket_by_bps(struct btree_trans *trans, + struct bch_dev *ca, + struct bpos bucket, + u8 gen, + struct bkey_buf *last_flushed) +{ + struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); + struct bpos bp_end = 
bucket_pos_to_bp_end(ca, bucket); + + return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + bp_start, bp_end, 0, k, + NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (bp.v->bucket_gen != gen) + continue; + + /* filter out bps with gens that don't match */ + + invalidate_one_bp(trans, ca, bp, last_flushed); + })); +} + +noinline_for_stack static int invalidate_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *lru_iter, struct bkey_s_c lru_k, + struct bkey_buf *last_flushed, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - unsigned cached_sectors; + struct btree_iter alloc_iter = {}; int ret = 0; if (*nr_to_invalidate <= 0) @@ -2081,35 +2151,37 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); - ret = PTR_ERR_OR_ZERO(a); + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + ret = bkey_err(alloc_k); if (ret) - goto out; + return ret; + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) goto out; - BUG_ON(a->v.data_type != BCH_DATA_cached); - BUG_ON(a->v.dirty_sectors); + /* + * Impossible since alloc_lru_idx_read() only returns nonzero if the + * bucket is supposed to be on the cached bucket LRU (i.e. 
+ * BCH_DATA_cached) + * + * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 + */ + BUG_ON(a->data_type != BCH_DATA_cached); + BUG_ON(a->dirty_sectors); - if (!a->v.cached_sectors) + if (!a->cached_sectors) bch_err(c, "invalidating empty bucket, confused"); - cached_sectors = a->v.cached_sectors; + unsigned cached_sectors = a->cached_sectors; + u8 gen = a->gen; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - a->v.gen++; - a->v.data_type = 0; - a->v.dirty_sectors = 0; - a->v.stripe_sectors = 0; - a->v.cached_sectors = 0; - a->v.io_time[READ] = bch2_current_io_time(c, READ); - a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); - - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); if (ret) goto out; @@ -2117,6 +2189,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, --*nr_to_invalidate; out: fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; } @@ -2143,6 +2216,10 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct btree_trans *trans = bch2_trans_get(c); int ret = 0; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; @@ -2167,7 +2244,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (!k.k) break; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); restart_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -2180,6 +2257,7 @@ restart_err: err: bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); + bch2_bkey_buf_exit(&last_flushed, c); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index de25ba4ee94b..c556ccaffe89 
100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, if (a.stripe) return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; if (bch2_bucket_sectors_dirty(a)) - return data_type; + return bucket_data_type(data_type); if (a.cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5a781fb4c794..0cac65347a5d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void bch2_open_bucket_write_error(struct bch_fs *c, struct open_buckets *obs, - unsigned dev) + unsigned dev, int err) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, obs, ob, i) if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob); + bch2_ec_bucket_cancel(c, ob, err); } static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) @@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -static inline unsigned open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - static inline bool may_alloc_bucket(struct bch_fs *c, struct bpos bucket, struct bucket_alloc_state *s) @@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= 
bch2_open_buckets_reserved(watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -648,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + cl, flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -1336,7 +1319,7 @@ retry: if (wp->data_type != BCH_DATA_user) have_cache = true; - if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + if (target && !(flags & BCH_WRITE_only_specified_devs)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1426,7 +1409,7 @@ err: if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ret = -BCH_ERR_bucket_alloc_blocked; - if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && + if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) ret = -BCH_ERR_bucket_alloc_blocked; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index f25481a0d1a0..69ec6a012898 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) return bch2_dev_have_ref(c, ob->dev); } +static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case BCH_WATERMARK_interior_updates: + return 0; + case BCH_WATERMARK_reclaim: + 
return OPEN_BUCKETS_COUNT / 6; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum bch_watermark, enum bch_data_type, struct closure *); @@ -65,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, } void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned); + struct open_buckets *, unsigned, int); void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4aa8ee026cb8..8f79f46c2a78 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -90,6 +90,7 @@ struct dev_stripe_state { x(stopped) \ x(waiting_io) \ x(waiting_work) \ + x(runnable) \ x(running) enum write_point_state { @@ -125,6 +126,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; + u64 last_runtime; } __aligned(SMP_CACHE_BYTES); }; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ebeb6a5ff9d2..20c497f0c2cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -11,6 +11,7 @@ #include "checksum.h" #include "disk_accounting.h" #include "error.h" +#include "progress.h" #include <linux/mm.h> @@ -49,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -244,27 +247,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) return bkey_s_c_null; 
- if (likely(!bp.v->level)) { - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, 0, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } + bch2_trans_node_iter_init(trans, iter, + bp.v->btree_id, + bp.v->pos, + 0, + bp.v->level, + iter_flags); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) + return k; - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(trans, iter); + + if (!bp.v->level) { int ret = backpointer_target_not_found(trans, bp, k, last_flushed); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); @@ -514,6 +521,22 @@ check_existing_bp: if (!other_extent.k) goto missing; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); + if (ca) { + struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); + bkey_for_each_ptr(other_extent_ptrs, ptr) + if (ptr->dev == bp->k.p.inode && + dev_ptr_stale_rcu(ca, ptr)) { + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); + if (ret) + goto err; + goto out; + } + } + rcu_read_unlock(); + if (bch2_extents_match(orig_k, other_extent)) { printbuf_reset(&buf); prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); @@ -590,9 +613,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - if (p.ptr.dev == 
BCH_SB_MEMBER_INVALID) continue; @@ -600,9 +620,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + + bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); rcu_read_unlock(); - if (check || empty) { + if ((check || empty) && !stale) { struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); @@ -715,71 +737,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -static inline void progress_init(struct progress_indicator_state *s, - struct bch_fs *c, - u64 btree_id_mask) -{ - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = i, - }; - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -static void progress_update_iter(struct btree_trans *trans, - struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? 
div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -} - static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { @@ -787,7 +744,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct progress_indicator_state progress; int ret = 0; - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); @@ -806,7 +763,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -827,7 +784,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -836,9 +793,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } @@ -889,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int 
alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) @@ -901,9 +863,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b goto err; } - /* Cached pointers don't have backpointers: */ - if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_cached] != a->cached_sectors || sectors[ALLOC_stripe] != a->stripe_sectors) { if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); @@ -912,6 +873,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b } if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: -BCH_ERR_transaction_restart_nested; @@ -919,7 +881,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b } if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe]) + !sectors[ALLOC_stripe] && + !sectors[ALLOC_cached]) __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); else __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); @@ -1206,11 +1169,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, ({ - progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); diff --git 
a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 060dad1521ee..16575dbc5736 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H -#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#ifndef _BCACHEFS_BACKPOINTERS_H +#define _BCACHEFS_BACKPOINTERS_H #include "btree_cache.h" #include "btree_iter.h" @@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, return BCH_DATA_btree; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + if (p.has_ec) + return BCH_DATA_stripe; + if (p.ptr.cached) + return BCH_DATA_cached; + else + return BCH_DATA_user; case KEY_TYPE_stripe: { const struct bch_extent_ptr *ptr = &entry->ptr; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); @@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bkey_i_backpointer *bp) { bkey_backpointer_init(&bp->k_i); - bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); + bp->k.p.inode = p.ptr.dev; + + if (k.k->type != KEY_TYPE_stripe) + bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; + else { + /* + * Put stripe backpointers where they won't collide with the + * extent backpointers within the stripe: + */ + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; + } + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 161cf2f05d2a..f52311017aee 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -203,6 +203,7 @@ #include <linux/types.h> #include <linux/workqueue.h> #include <linux/zstd.h> +#include <linux/unicode.h> #include "bcachefs_format.h" #include 
"btree_journal_iter_types.h" @@ -444,6 +445,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_sort) \ x(btree_node_read) \ x(btree_node_read_done) \ + x(btree_node_write) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ @@ -456,6 +458,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_journal_max_open) \ x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ @@ -533,6 +536,7 @@ struct bch_dev { */ struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; + unsigned long write_errors_start; __uuid_t uuid; char name[BDEVNAME_SIZE]; @@ -623,7 +627,8 @@ struct bch_dev { x(topology_error) \ x(errors_fixed) \ x(errors_not_fixed) \ - x(no_invalid_checks) + x(no_invalid_checks) \ + x(discard_mount_opt_set) \ enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -687,7 +692,8 @@ struct btree_trans_buf { x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ - x(btree_write_buffer) + x(btree_write_buffer) \ + x(btree_node_scrub) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -696,6 +702,8 @@ enum bch_write_ref { BCH_WRITE_REF_NR, }; +#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) + struct bch_fs { struct closure cl; @@ -780,6 +788,9 @@ struct bch_fs { u64 btrees_lost_data; } sb; +#ifdef CONFIG_UNICODE + struct unicode_map *cf_encoding; +#endif struct bch_sb_handle disk_sb; @@ -969,7 +980,6 @@ struct bch_fs { mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; - struct crypto_shash *sha256; struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; @@ -993,15 +1003,11 @@ struct bch_fs { wait_queue_head_t copygc_running_wq; /* STRIPES: */ - GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; spinlock_t ec_stripes_new_lock; - ec_stripes_heap ec_stripes_heap; - struct mutex ec_stripes_heap_lock; - /* ERASURE 
CODING */ struct list_head ec_stripe_head_list; struct mutex ec_stripe_head_lock; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f70f0108401f..e96d87767020 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -686,7 +686,12 @@ struct bch_sb_field_ext { x(inode_depth, BCH_VERSION(1, 17)) \ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ x(autofix_errors, BCH_VERSION(1, 19)) \ - x(directory_size, BCH_VERSION(1, 20)) + x(directory_size, BCH_VERSION(1, 20)) \ + x(cached_backpointers, BCH_VERSION(1, 21)) \ + x(stripe_backpointers, BCH_VERSION(1, 22)) \ + x(stripe_lru, BCH_VERSION(1, 23)) \ + x(casefolding, BCH_VERSION(1, 24)) \ + x(extent_flags, BCH_VERSION(1, 25)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -837,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +/* one free bit */ LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -855,6 +861,8 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); +LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); +LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -908,7 +916,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(journal_no_flush, 16) \ x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ - 
x(incompat_version_field, 19) + x(incompat_version_field, 19) \ + x(casefolding, 20) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ @@ -922,7 +931,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u BIT_ULL(BCH_FEATURE_new_siphash)| \ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_journal_no_flush)| \ + BIT_ULL(BCH_FEATURE_incompat_version_field)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 3c23bdf788ce..52594e925eb7 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -87,6 +87,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) +#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -215,6 +216,10 @@ struct bch_ioctl_data { union { struct { __u32 dev; + __u32 data_types; + } scrub; + struct { + __u32 dev; __u32 pad; } migrate; struct { @@ -229,6 +234,11 @@ enum bch_data_event { BCH_DATA_EVENT_NR = 1, }; +enum data_progress_data_type_special { + DATA_PROGRESS_DATA_TYPE_phys = 254, + DATA_PROGRESS_DATA_TYPE_done = 255, +}; + struct bch_ioctl_data_progress { __u8 data_type; __u8 btree_id; @@ -237,11 +247,19 @@ struct bch_ioctl_data_progress { __u64 sectors_done; __u64 sectors_total; + __u64 sectors_error_corrected; + __u64 sectors_error_uncorrected; } __packed __aligned(8); +enum bch_ioctl_data_event_ret { + BCH_IOCTL_DATA_EVENT_RET_done = 1, + BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, +}; + struct bch_ioctl_data_event { __u8 type; - __u8 pad[7]; + __u8 ret; + __u8 pad[6]; union 
{ struct bch_ioctl_data_progress p; __u64 pad2[15]; @@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting { struct bkey_i_accounting accounting[]; }; +#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) + +struct bch_ioctl_query_counters { + __u16 nr; + __u16 flags; + __u32 pad; + __u64 d[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1ec1f90e0eb3..54666027aa85 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dd1d9b74076e..ff681e733598 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -27,6 +27,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "progress.h" #include "recovery_passes.h" #include "reflink.h" #include "recovery.h" @@ -656,7 +657,9 @@ fsck_err: return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) +static int bch2_gc_btree(struct btree_trans *trans, + struct progress_indicator_state *progress, + enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 
0 : 1; @@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); })); @@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - enum btree_id ids[BTREE_ID_NR]; struct printbuf buf = PRINTBUF; - unsigned i; int ret = 0; - for (i = 0; i < BTREE_ID_NR; i++) + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, ~0ULL); + + enum btree_id ids[BTREE_ID_NR]; + for (unsigned i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = bch2_gc_btree(trans, btree, true); + ret = bch2_gc_btree(trans, &progress, btree, true); } printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 756736f9243d..2ba33ffc9795 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" #include "btree_cache.h" @@ -1328,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work) bch_info(c, "retrying read"); ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); rb->have_ioref = ca != NULL; + rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1338,21 +1340,26 @@ static void btree_node_read_work(struct work_struct *work) } else { bio->bi_status = BLK_STS_REMOVED; } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); + + if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); + bch2_mark_io_failure(&failed, &rb->pick, false); can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - &failed, &rb->pick) > 0; + &failed, &rb->pick, -1) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { @@ -1400,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, 
bio); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref + ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); queue_work(c->btree_read_complete_wq, &rb->work); } @@ -1697,7 +1703,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick); + NULL, &pick, -1); if (ret <= 0) { struct printbuf buf = PRINTBUF; @@ -1811,6 +1817,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } +struct btree_node_scrub { + struct bch_fs *c; + struct bch_dev *ca; + void *buf; + bool used_mempool; + unsigned written; + + enum btree_id btree; + unsigned level; + struct bkey_buf key; + __le64 seq; + + struct work_struct work; + struct bio bio; +}; + +static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, + struct printbuf *err) +{ + unsigned written = 0; + + if (le64_to_cpu(data->magic) != bset_magic(c)) { + prt_printf(err, "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(data->magic)); + return false; + } + + while (written < (ptr_written ?: btree_sectors(c))) { + struct btree_node_entry *bne; + struct bset *i; + bool first = !written; + + if (first) { + bne = NULL; + i = &data->keys; + } else { + bne = (void *) data + (written << 9); + i = &bne->keys; + + if (!ptr_written && i->seq != data->keys.seq) + break; + } + + struct nonce nonce = btree_nonce(i, written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); + if (bch2_crc_cmp(data->csum, csum)) { + 
bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); + return false; + } + } + + written += vstruct_sectors(data, c->block_bits); + } else { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + if (bch2_crc_cmp(bne->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); + return false; + } + } + + written += vstruct_sectors(bne, c->block_bits); + } + } + + return true; +} + +static void btree_node_scrub_work(struct work_struct *work) +{ + struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); + struct bch_fs *c = scrub->c; + struct printbuf err = PRINTBUF; + + __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, + bkey_i_to_s_c(scrub->key.k)); + prt_newline(&err); + + if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { + struct btree_trans *trans = bch2_trans_get(c); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, scrub->btree, + scrub->key.k->k.p, 0, scrub->level - 1, 0); + + struct btree *b; + int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + if (ret) + goto err; + + if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { + bch_err(c, "error validating btree node during scrub on %s at btree %s", + scrub->ca->name, err.buf); + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_begin(trans); + bch2_trans_put(trans); + } + + printbuf_exit(&err); + bch2_bkey_buf_exit(&scrub->key, c);; + btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); + percpu_ref_put(&scrub->ca->io_ref); + kfree(scrub); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); +} + +static void btree_node_scrub_endio(struct bio *bio) +{ + struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); + + queue_work(scrub->c->btree_read_complete_wq, &scrub->work); +} + +int 
bch2_btree_node_scrub(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c k, unsigned dev) +{ + if (k.k->type != KEY_TYPE_btree_ptr_v2) + return 0; + + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) + return -BCH_ERR_erofs_no_writes; + + struct extent_ptr_decoded pick; + int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); + if (ret <= 0) + goto err; + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { + ret = -BCH_ERR_device_offline; + goto err; + } + + bool used_mempool = false; + void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); + + unsigned vecs = buf_pages(buf, c->opts.btree_node_size); + + struct btree_node_scrub *scrub = + kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); + if (!scrub) { + ret = -ENOMEM; + goto err_free; + } + + scrub->c = c; + scrub->ca = ca; + scrub->buf = buf; + scrub->used_mempool = used_mempool; + scrub->written = btree_ptr_sectors_written(k); + + scrub->btree = btree; + scrub->level = level; + bch2_bkey_buf_init(&scrub->key); + bch2_bkey_buf_reassemble(&scrub->key, c, k); + scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; + + INIT_WORK(&scrub->work, btree_node_scrub_work); + + bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); + scrub->bio.bi_iter.bi_sector = pick.ptr.offset; + scrub->bio.bi_end_io = btree_node_scrub_endio; + submit_bio(&scrub->bio); + return 0; +err_free: + btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); + percpu_ref_put(&ca->io_ref); +err: + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + return ret; +} + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { @@ -1831,7 +2021,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, 
&w->journal); } -static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_write *w = btree_prev_write(b); unsigned long old, new; @@ -1839,6 +2029,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) bch2_btree_complete_write(c, b, w); + if (start_time) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); + old = READ_ONCE(b->flags); do { new = old; @@ -1869,7 +2062,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_trans *trans = bch2_trans_get(c); @@ -1877,7 +2070,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) /* we don't need transaction context anymore after we got the lock. 
*/ bch2_trans_put(trans); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, start_time); six_unlock_read(&b->c.lock); } @@ -1887,6 +2080,7 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; + u64 start_time = wbio->start_time; int ret = 0; btree_bounce_free(c, @@ -1919,12 +2113,18 @@ static void btree_node_write_work(struct work_struct *work) } out: bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b); + btree_node_write_done(c, b, start_time); return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -1937,16 +2137,21 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = wbio->have_ioref ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; - unsigned long flags; - if (wbio->have_ioref) - bch2_latency_acct(ca, wbio->submit_time, WRITE); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); - if (!ca || - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("btree")) { + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } + + if (bio->bi_status) { + unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); @@ -2023,6 +2228,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool validate_before_checksum = false; enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; + u64 start_time = local_clock(); int ret; if (flags & BTREE_WRITE_ALREADY_STARTED) @@ -2231,6 +2437,7 @@ do_write: wbio->data = data; wbio->data_bytes = bytes; wbio->sector_offset = b->written; + wbio->start_time = start_time; wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.first_btree_write = !b->written; @@ -2258,7 +2465,7 @@ err: b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, 0); } /* diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 6f9e4a6dacf7..dbf76d22c660 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -52,6 +52,7 @@ struct btree_write_bio { void *data; unsigned data_bytes; unsigned sector_offset; + u64 start_time; struct bch_write_bio wbio; }; @@ -132,6 +133,9 @@ void bch2_btree_node_read(struct 
btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, unsigned); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..7542c6f9c88e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..8823eec6b284 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef 
CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index caef65adeae4..94eb2b73a843 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7f06deee13c..678161321e42 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); + u64 submit_time = local_clock(); submit_bio_wait(bio); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status))) + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, + "IO error in try_read_btree_node() at %llu: %s", + offset, 
bch2_blk_status_to_str(bio->bi_status)); return; + } if (le64_to_cpu(bn->magic) != bset_magic(c)) return; @@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_get(&ca->io_ref); + percpu_ref_put(&ca->io_ref); closure_put(w->cl); kfree(w); return 0; @@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - struct task_struct *t; - if (!w) { percpu_ref_put(&ca->io_ref); ret = -ENOMEM; goto err; } - percpu_ref_get(&ca->io_ref); - closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; - t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); - closure_put(&cl); - f->ret = ret; - bch_err(c, "error starting kthread: %i", ret); + kfree(w); + bch_err_msg(c, ret, "starting kthread"); break; } + + closure_get(&cl); + percpu_ref_get(&ca->io_ref); + wake_up_process(t); } err: closure_sync(&cl); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index c4f524b2ca9a..7d7e52ddde02 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -336,6 +337,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != path->cached); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); + BUG_ON(i->bkey_type != 
__btree_node_type(path->level, path->btree_id)); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_internal_snapshot_node) && @@ -517,69 +519,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } } -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned *btree_id_updates_start) +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - bool trans_trigger_run; + unsigned sort_id_start = 0; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + while (sort_id_start < trans->nr_updates) { + unsigned i, sort_id = trans->updates[sort_id_start].sort_order; + bool trans_trigger_run; - for (unsigned i = *btree_id_updates_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id < btree_id) { - *btree_id_updates_start = i; - continue; + /* + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being + * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop + * references before they are re-added. 
+ * + * Running triggers will append more updates to the list of + * updates as we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = sort_id_start; + i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; + i++) { + if (trans->updates[i].sort_order < sort_id) { + sort_id_start = i; + continue; + } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; } + } while (trans_trigger_run); - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - i->btree_id == btree_id && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned btree_id = 0, btree_id_updates_start = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. 
- */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); - if (ret) - return ret; + sort_id_start = i; } - btree_id_updates_start = 0; - ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); - if (ret) - return ret; - #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && @@ -903,18 +881,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_journal_res_get_blocked: + if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { /* * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag @@ -922,13 +889,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; + goto out; } ret = drop_locks_do(trans, bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_CHECK)); + goto out; + } + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + 
bch2_accounting_update_sb(trans)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); @@ -950,7 +930,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(ret >= 0); break; } - +out: BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a09cbe9cd94f..77578da2d23f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) struct btree_insert_entry { unsigned flags; + u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; u8 level:4; @@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) return BIT_ULL(btree) & mask; } +static inline u8 btree_trigger_order(enum btree_id btree) +{ + switch (btree) { + case BTREE_ID_alloc: + return U8_MAX; + case BTREE_ID_stripes: + return U8_MAX - 1; + default: + return btree; + } +} + struct btree_root { struct btree *b; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 13d794f201a5..bd2eb42edb24 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -17,7 +17,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { - return cmp_int(l->btree_id, r->btree_id) ?: + return cmp_int(l->sort_order, r->sort_order) ?: cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); @@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, n = (struct btree_insert_entry) { .flags = flags, + .sort_order = btree_trigger_order(path->btree_id), .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, @@ -511,6 +512,8 @@ static noinline int 
bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 47d8690f01bf..d2e1c04353f6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e4e7c804625e..67f1e3202835 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? 
b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -2126,6 +2126,31 @@ err_free_update: goto out; } +static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) +{ + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? */ + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + ret = -BCH_ERR_btree_node_dying; + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -2191,66 +2216,78 @@ err: goto out; } -struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; - struct list_head list; - enum btree_id btree_id; - unsigned level; - struct bkey_buf key; -}; - -static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) +static int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, - a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); struct btree *b = bch2_btree_iter_peek_node(&iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0) + ? 
bch2_btree_node_rewrite(trans, &iter, b, flags) : -ENOENT; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} -#if 0 - /* Tracepoint... */ - if (!ret || ret == -ENOENT) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; +int bch2_btree_node_rewrite_pos(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bpos pos, unsigned flags) +{ + BUG_ON(!level); - if (!ret) { - prt_printf(&buf, "rewrite node:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - } else { - prt_printf(&buf, "node to rewrite not found:\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - prt_printf(&buf, "\n got: "); - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null)"); - } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -#endif -out: + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); +err: bch2_trans_iter_exit(trans, &iter); return ret; } +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, + struct btree *b, unsigned flags) +{ + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; + struct bkey_buf key; +}; + static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); if (ret != -ENOENT) bch_err_fn_ratelimited(c, ret); @@ -2494,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; - int ret; - - bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter); + int ret = get_iter_to_node(trans, &iter, b); if (ret) - goto out; - - /* has node been freed? */ - if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); + return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); -out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 26d646e1275c..be71cd73b864 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -169,7 +169,14 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); +int bch2_btree_node_rewrite_pos(struct btree_trans *, + enum btree_id, unsigned, + struct bpos, unsigned); +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, + struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, unsigned, bool); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 345b117a4a4a..e56ef623ebc1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -590,11 +590,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (ret) goto err; - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { @@ -674,10 +672,10 @@ err: return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - mutex_lock(&c->ec_stripes_heap_lock); + gc_stripe_lock(m); if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", @@ -693,7 +691,7 @@ err: .type = BCH_DISK_ACCOUNTING_replicas, }; memcpy(&acc.replicas, 
&m->r.e, replicas_entry_bytes(&m->r.e)); - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); @@ -726,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, .replicas.nr_required = 1, }; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -762,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -777,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -791,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -878,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -895,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos 
acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -916,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; + sectors[0] = -sectors[0]; - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a9acdd6c0c86..c5363256e363 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte.
- * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); @@ -167,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7174047b8e92..900b8680c8b5 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,6 +7,33 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. 
+ */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + struct bucket { u8 lock; u8 gen_valid:1; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 46e9e32105a9..57d55b3ddc71 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -11,6 +11,7 @@ #include "move.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-counters.h" #include "super-io.h" #include "thread_with_file.h" @@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - ctx->stats.data_type = U8_MAX; + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else { + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; + ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; + } return 0; } @@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_usage_read_short(c).used, + .type = BCH_DATA_EVENT_PROGRESS, + .ret = ctx->stats.ret, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), + .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), }; + if (ctx->arg.op == BCH_DATA_OP_scrub) { + struct bch_dev 
*ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage u; + bch2_dev_usage_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; + bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; + } + if (len < sizeof(e)) return -EINVAL; @@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); case BCH_IOCTL_QUERY_ACCOUNTING: return bch2_ioctl_query_accounting(c, arg); + case BCH_IOCTL_QUERY_COUNTERS: + return bch2_ioctl_query_counters(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 23a383577d4c..3726689093e3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c) return 0; } +#if 0 + +/* + * This seems to be duplicating code in cmd_remove_passphrase() in + * bcachefs-tools, but we might want to switch userspace to use this - and + * perhaps add an ioctl for calling this at runtime, so we can take the + * passphrase off of a mounted filesystem (which has come up). 
+ */ int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; @@ -725,6 +733,10 @@ out: return ret; } +/* + * For enabling encryption on an existing filesystem: not hooked up yet, but it + * should be + */ int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; @@ -781,6 +793,7 @@ err: memzero_explicit(&key, sizeof(key)); return ret; } +#endif void bch2_fs_encryption_exit(struct bch_fs *c) { @@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c) crypto_free_shash(c->poly1305); if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (c->sha256) - crypto_free_shash(c->sha256); } int bch2_fs_encryption_init(struct bch_fs *c) @@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - c->sha256 = crypto_alloc_shash("sha256", 0, 0); - ret = PTR_ERR_OR_ZERO(c->sha256); - if (ret) { - c->sha256 = NULL; - bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); - goto out; - } - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 43b9d71f2f2b..4ac251c8fcd8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); +#if 0 int bch2_disable_encryption(struct bch_fs *); int bch2_enable_encryption(struct bch_fs *, bool); +#endif void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..85fc90342492 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int ret = 0, ret2; enum 
bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int 
bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 642fbc60ecab..0ec273daccb7 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,8 @@ #include "subvolume.h" #include "trace.h" +#include <linux/ioprio.h> + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { + if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { bkey_for_each_ptr(ptrs, ptr2) { if (ptr2 == ptr) break; @@ -91,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static noinline void trace_move_extent_finish2(struct data_update *u, +static noinline void trace_io_move_finish2(struct data_update *u, struct bkey_i *new, struct bkey_i *insert) { @@ -111,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_newline(&buf); - trace_move_extent_finish(c, buf.buf); + trace_io_move_finish(c, buf.buf); printbuf_exit(&buf); } -static void trace_move_extent_fail2(struct data_update *m, +static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, struct bkey_i *insert, @@ -126,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m, struct printbuf buf = PRINTBUF; unsigned rewrites_found = 0; - if (!trace_move_extent_fail_enabled()) + if (!trace_io_move_fail_enabled()) return; prt_str(&buf, msg); @@ -166,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); } - trace_move_extent_fail(c, buf.buf); + trace_io_move_fail(c, buf.buf); printbuf_exit(&buf); } @@ -214,7 +216,7 @@ static int 
__bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); if (!bch2_extents_match(k, old)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), NULL, "no match:"); goto nowork; } @@ -254,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (m->data_opts.rewrite_ptrs && !rewrites_found && bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); goto nowork; } @@ -271,7 +273,7 @@ restart_drop_conflicting_replicas: } if (!bkey_val_u64s(&new->k)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); goto nowork; } @@ -352,7 +354,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); - ret = -EIO; + ret = -BCH_ERR_invalid_bkey; goto out; } @@ -385,9 +387,9 @@ restart_drop_extra_replicas: if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - if (trace_move_extent_finish_enabled()) - trace_move_extent_finish2(m, &new->k_i, insert); + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -409,7 +411,7 @@ nowork: &m->stats->sectors_raced); } - count_event(c, move_extent_fail); + count_event(c, io_move_fail); bch2_btree_iter_advance(&iter); goto next; @@ -427,14 +429,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } -void bch2_data_update_read_done(struct data_update *m, - struct 
bch_extent_crc_unpacked crc) +void bch2_data_update_read_done(struct data_update *m) { + m->read_done = true; + /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->op.crc = crc; - m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + m->op.crc = m->rbio.pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); closure_call(&m->op.cl, bch2_write, NULL, NULL); } @@ -444,31 +449,34 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + kfree(update->bvecs); + update->bvecs = NULL; + if (c->opts.nocow_enabled) bkey_nocow_unlock(c, k); bkey_put_dev_refs(c, k); - bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + bch2_bkey_buf_exit(&update->k, c); } -static void bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; - struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; struct closure cl; struct btree_iter iter; struct bkey_s_c k; - int ret; + int ret = 0; closure_init_stack(&cl); bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - while (bio_sectors(bio)) { - unsigned sectors = bio_sectors(bio); + while (bpos_lt(update->op.pos, update->k.k->k.p)) { + unsigned sectors = update->k.k->k.p.offset - + update->op.pos.offset; bch2_trans_begin(trans); @@ -504,7 +512,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch_err_fn_ratelimited(c, ret); if (ret) - return; + break; sectors = min(sectors, wp->sectors_free); @@ -514,7 +522,6 @@ static void bch2_update_unwritten_extent(struct btree_trans 
*trans, bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); bch2_alloc_sectors_done(c, wp); - bio_advance(bio, sectors << 9); update->op.pos.offset += sectors; extent_for_each_ptr(extent_i_to_s(e), ptr) @@ -533,13 +540,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_unlock(trans); closure_sync(&cl); } + + return ret; } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - printbuf_tabstop_push(out, 20); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); @@ -574,6 +584,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } +void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_printf(out, "read_done:\t\%u\n", m->read_done); + bch2_write_op_to_text(out, &m->op); + printbuf_indent_sub(out, 2); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -617,12 +638,85 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, 
PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + return -ENOMEM; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { + kfree(m->bvecs); + m->bvecs = NULL; + return -ENOMEM; + } + + rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; +} + +static int can_write_extent(struct bch_fs *c, struct data_update *m) +{ + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return -BCH_ERR_data_update_done_would_block; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? m->op.target + : 0; + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + if (!nr_replicas) + return -BCH_ERR_data_update_done_no_rw_devs; + if (nr_replicas < m->op.nr_replicas) + return -BCH_ERR_insufficient_devices; + return 0; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts io_opts, + struct bch_io_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) @@ 
-640,16 +734,7 @@ int bch2_data_update_init(struct btree_trans *trans, * snapshots table - just skip it, we can move it later. */ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; - - if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done; - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - bkey_put_dev_refs(c, k); - return -BCH_ERR_nocow_lock_blocked; - } + return -BCH_ERR_data_update_done_no_snapshot; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -658,18 +743,18 @@ int bch2_data_update_init(struct btree_trans *trans, m->ctxt = ctxt; m->stats = ctxt ? ctxt->stats : NULL; - bch2_write_op_init(&m->op, c, io_opts); + bch2_write_op_init(&m->op, c, *io_opts); m->op.pos = bkey_start_pos(k.k); m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + m->op.flags |= BCH_WRITE_pages_stable| + BCH_WRITE_pages_owned| + BCH_WRITE_data_encoded| + BCH_WRITE_move| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression; + m->op.compression_opt = io_opts->background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; @@ -707,7 +792,7 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* * If current extent durability is less than io_opts.data_replicas, @@ -740,28 +825,70 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); - goto out; + ret 
= bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); + if (!ret) + ret = -BCH_ERR_data_update_done_no_writes_needed; + goto out_bkey_buf_exit; } + /* + * Check if the allocation will succeed, to avoid getting an error later + * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless + * read: + * + * This guards against + * - BCH_WRITE_alloc_nowait allocations failing (promotes) + * - Destination target full + * - Device(s) in destination target offline + * - Insufficient durability available in destination target + * (i.e. trying to move a durability=2 replica to a target with a + * single durability=2 device) + */ + ret = can_write_extent(c, m); + if (ret) + goto out_bkey_buf_exit; + if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto out; + goto out_bkey_buf_exit; + } + + if (!bkey_get_dev_refs(c, k)) { + ret = -BCH_ERR_data_update_done_no_dev_refs; + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto out_put_dev_refs; } if (bkey_extent_is_unwritten(k)) { - bch2_update_unwritten_extent(trans, m); - goto out; + ret = bch2_update_unwritten_extent(trans, m) ?: + -BCH_ERR_data_update_done_unwritten; + goto out_nocow_unlock; } + ret = bch2_data_update_bios_init(m, c, io_opts); + if (ret) + goto out_nocow_unlock; + return 0; -out: - bch2_data_update_exit(m); - return ret ?: -BCH_ERR_data_update_done; +out_nocow_unlock: + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); +out_put_dev_refs: + bkey_put_dev_refs(c, k); +out_put_disk_res: + bch2_disk_reservation_put(c, &m->op.res); +out_bkey_buf_exit: + bch2_bkey_buf_exit(&m->k, c); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index e4b50723428e..c194cbbf5b51 100644 --- 
a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,6 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" +#include "io_read.h" #include "io_write_types.h" struct moving_context; @@ -15,6 +16,9 @@ struct data_update_opts { u8 extra_replicas; unsigned btree_insert_flags; unsigned write_flags; + + int read_dev; + bool scrub; }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, @@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, struct data_update { /* extent being updated: */ + bool read_done; enum btree_id btree_id; struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; struct bch_move_stats *stats; + + struct bch_read_bio rbio; struct bch_write_op op; + struct bio_vec *bvecs; }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); +void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked); +void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, @@ -43,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, struct bch_io_opts *, struct data_update_opts *); +int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, + struct bch_io_opts *); + void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct write_point_specifier, - struct bch_io_opts, struct data_update_opts, + struct bch_io_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 55333e82d1fe..788af88f6979 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ 
-7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -190,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, unsigned offset = 0; int ret; - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } @@ -844,8 +845,11 @@ restart: seqmutex_unlock(&c->btree_trans_lock); } -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); + +static ssize_t bch2_simple_print(struct file *file, char __user *buf, + size_t size, loff_t *ppos, + fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, i->ret = 0; if (!i->iter) { - btree_deadlock_to_text(&i->buf, c); + fn(&i->buf, c); i->iter++; } @@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, return ret ?: i->ret; } +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); +} + static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, @@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { .read = bch2_btree_deadlock_read, }; +static ssize_t bch2_write_points_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); +} + +static const struct file_operations write_points_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = 
bch2_dump_release, + .read = bch2_write_points_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 600eee936f13..d7f9f79318a2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,6 +13,40 @@ #include <linux/dcache.h> +static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + *out_cf = (struct qstr) QSTR_INIT(NULL, 0); + +#ifdef CONFIG_UNICODE + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); + int ret = PTR_ERR_OR_ZERO(buf); + if (ret) + return ret; + + ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); + if (ret <= 0) + return ret; + + *out_cf = (struct qstr) QSTR_INIT(buf, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + if (likely(!info->cf_encoding)) { + *out_cf = *str; + return 0; + } else { + return bch2_casefold(trans, info, str, out_cf); + } +} + static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) @@ -28,13 +62,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) #endif return bkey_bytes - - offsetof(struct bch_dirent, d_name) - + (d.v->d_casefold + ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + : offsetof(struct bch_dirent, d_name)) - trailing_nuls; } struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); + } else { + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + } +} + +static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) +{ + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); + } else { + return (struct qstr) QSTR_INIT(NULL, 0); + } +} + +static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) +{ + return d.v->d_casefold + ? 
bch2_dirent_get_casefold_name(d) + : bch2_dirent_get_name(d); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(d); + struct qstr name = bch2_dirent_get_lookup_name(d); return bch2_dirent_hash(info, &name); } @@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); const struct qstr *r_name = _r; return !qstr_eq(l_name, *r_name); @@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_name(l); - const struct qstr r_name = bch2_dirent_get_name(r); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); + const struct qstr r_name = bch2_dirent_get_lookup_name(r); return !qstr_eq(l_name, r_name); } @@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned name_block_len = bch2_dirent_name_bytes(d); struct qstr d_name = bch2_dirent_get_name(d); + struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); int ret = 0; bkey_fsck_err_on(!d_name.len, c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, c, dirent_val_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + 
"dirent names exceed bkey size (%d + %d > %d)", + d_name.len, d_cf_name.len, name_block_len); /* * Check new keys don't exceed the max length @@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, dirent_to_itself, "dirent points to own directory"); + + if (d.v->d_casefold) { + bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && + d_cf_name.len > BCH_NAME_MAX, + c, dirent_cf_name_too_big, + "dirent w/ cf name too big (%u > %u)", + d_cf_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), + c, dirent_stray_data_after_cf_name, + "dirent has stray data after cf name's NUL"); + } fsck_err: return ret; } @@ -163,15 +236,14 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - subvol_inum dir, u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + int name_len, int cf_name_len, + u64 dst) { struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); BUG_ON(u64s > U8_MAX); @@ -190,14 +262,65 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } dirent->v.d_type = type; + dirent->v.d_unused = 0; + dirent->v.d_casefold = cf_name_len ? 
1 : 0; - memcpy(dirent->v.d_name, name->name, name->len); - memset(dirent->v.d_name + name->len, 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); + return dirent; +} - EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); +static void dirent_init_regular_name(struct bkey_i_dirent *dirent, + const struct qstr *name) +{ + EBUG_ON(dirent->v.d_casefold); + + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); +} + +static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, + const struct qstr *name, + const struct qstr *cf_name) +{ + EBUG_ON(!dirent->v.d_casefold); + EBUG_ON(!cf_name->len); + + dirent->v.d_cf_name_block.d_name_len = name->len; + dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); + memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_name->len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + const struct qstr *name, + const struct qstr *cf_name, + u64 dst) +{ + struct bkey_i_dirent *dirent; + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); + if (IS_ERR(dirent)) + return dirent; + + if (cf_name) + dirent_init_casefolded_name(dirent, name, cf_name); + else + dirent_init_regular_name(dirent, name); + + EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); return dirent; } @@ -213,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -233,16 +356,28 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, + u64 *i_size, enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir, type, name, dst_inum); + if (hash_info->cf_encoding) { + struct qstr cf_name; + ret = bch2_casefold(trans, hash_info, name, &cf_name); + if (ret) + return ret; + dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum); + } else { + dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum); + } + ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; + *i_size += bkey_bytes(&dirent->k); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; @@ -275,12 +410,13 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, + subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 
*dst_offset, enum bch_rename_mode mode) { + struct qstr src_name_lookup, dst_name_lookup; struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst = bkey_s_c_null; @@ -295,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ + ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); + if (ret) + goto out; old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, + src_hash, src_dir, &src_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) @@ -308,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; /* Lookup dst: */ + ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); + if (ret) + goto out; if (mode == BCH_RENAME) { /* * Note that we're _not_ checking if the target already exists - @@ -315,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans, * correctness: */ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name); + dst_hash, dst_dir, &dst_name_lookup); if (ret) goto out; } else { old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, + dst_hash, dst_dir, &dst_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) @@ -336,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? 
&dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -346,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, + src_hash->cf_encoding ? &src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -406,6 +550,14 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + if (old_dst.k) + *dst_dir_i_size -= bkey_bytes(old_dst.k); + *src_dir_i_size -= bkey_bytes(old_src.k); + + if (mode == BCH_RENAME_EXCHANGE) + *src_dir_i_size += bkey_bytes(&new_src->k); + *dst_dir_i_size += bkey_bytes(&new_dst->k); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -465,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); + if (ret) + return ret; + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - int ret = bkey_err(k); + hash_info, dir, &lookup_name, flags); + ret = bkey_err(k); if (ret) goto err; @@ -572,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) return ret < 0 ? 
ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 362b3b2f2f2e..0880772b80a9 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -25,10 +25,13 @@ struct bch_inode_info; struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); -static inline unsigned dirent_val_u64s(unsigned len) +static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { - return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, - sizeof(u64)); + unsigned bytes = cf_len + ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len + : offsetof(struct bch_dirent, d_name) + len; + + return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_dirent_read_target(struct btree_trans *, subvol_inum, @@ -47,7 +50,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, + const struct qstr *, u64, u64 *, u64 *, enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) @@ -62,8 +65,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, u64 *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -79,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h index 5e116b88e814..a46dbddd21aa 100644 --- a/fs/bcachefs/dirent_format.h +++ b/fs/bcachefs/dirent_format.h @@ -29,9 +29,25 @@ struct bch_dirent { * Copy of mode bits 12-15 from the target inode - so userspace can get * the filetype without having to do a stat() */ - __u8 d_type; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 d_type:5, + d_unused:2, + d_casefold:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 d_casefold:1, + d_unused:2, + d_type:5; +#endif - __u8 d_name[]; + union { + struct { + __u8 d_pad; + __le16 d_name_len; + __le16 d_cf_name_len; + __u8 d_names[]; + } d_cf_name_block __packed; + __DECLARE_FLEX_ARRAY(__u8, 
d_name); + } __packed; } __packed __aligned(8); #define DT_SUBVOL 16 diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..f9214e2d1346 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -85,6 +85,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..15190196485f 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -113,14 +113,14 @@ enum disk_accounting_type { BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; @@ -149,10 +149,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct 
bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d2a5e76e6479..f2b9225fe0bc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -20,6 +20,7 @@ #include "io_read.h" #include "io_write.h" #include "keylist.h" +#include "lru.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -104,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + u64 submit_time; struct bio bio; }; @@ -298,10 +300,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bpos bucket = PTR_BUCKET_POS(ca, ptr); if (flags & BTREE_TRIGGER_transactional) { + struct extent_ptr_decoded p = { + .ptr = *ptr, + .crc = bch2_extent_crc_unpack(s.k, NULL), + }; + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, + (const union bch_extent_entry *) ptr, &bp); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: + bch2_bucket_backpointer_mod(trans, s.s_c, &bp, + !(flags & BTREE_TRIGGER_overwrite)); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { @@ -366,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int 
bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -399,6 +400,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, (new_s->nr_blocks != old_s->nr_blocks || new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_transactional) { + int ret = bch2_lru_change(trans, + BCH_LRU_STRIPE_FRAGMENTATION, + idx, + stripe_lru_pos(old_s), + stripe_lru_pos(new_s)); + if (ret) + return ret; + } if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* @@ -472,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -726,14 +704,15 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? 
BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); + } int stale = dev_ptr_stale(ca, ptr); if (stale) { @@ -796,6 +775,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; @@ -917,26 +897,6 @@ err: static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1009,180 +969,50 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty */ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} - 
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); -} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - 
.blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * XXX + * can we kill this and delete stripes from the trigger? + */ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -1294,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1360,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1380,8 +1210,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b if (bp_k.k->type != KEY_TYPE_backpointer) continue; + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + if (bp.v->btree_id == BTREE_ID_stripes) + continue; + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - 
bkey_s_c_to_backpointer(bp_k), &last_flushed); + bp, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); @@ -1393,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1473,6 +1305,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1481,6 +1314,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = -BCH_ERR_ec_block_read; goto err; } @@ -1506,6 +1340,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } @@ -1527,6 +1362,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) @@ -1612,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = 
err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) @@ -1968,39 +1805,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return 0; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; - - if (may_create_new_stripe(c)) - return -1; + struct bch_fs *c = trans->c; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - continue; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if (ret) + goto err; - stripe_idx = h->data[heap_idx].idx; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2052,24 +1890,33 @@ static int 
__bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } @@ -2367,46 +2214,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - 
for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, - m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2477,15 +2285,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); @@ -2503,3 +2308,40 @@ int bch2_fs_ec_init(struct bch_fs *c) return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } + +static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, + struct bkey_s_c k, + struct bkey_buf *last_flushed) +{ + if (k.k->type != KEY_TYPE_stripe) + return 0; + + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + u64 lru_idx = stripe_lru_pos(s.v); + if (lru_idx) { + int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, + k.k->p.offset, lru_idx, k, last_flushed); + if (ret) + return ret; + } + return 0; +} + +int bch2_check_stripe_to_lru_refs(struct bch_fs *c) +{ + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 
583ca6a226da..62d27e04d763 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s, memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } +#define STRIPE_LRU_POS_EMPTY 1 + +static inline u64 stripe_lru_pos(const struct bch_stripe *s) +{ + if (!s) + return 0; + + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; + + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); + + /* Will be picked up by the stripe_delete worker */ + if (blocks_empty == nr_data) + return STRIPE_LRU_POS_EMPTY; + + if (!blocks_empty) + return 0; + + /* invert: more blocks empty = reuse first */ + return LRU_TIME_MAX - blocks_empty; +} + static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, const struct bch_extent_ptr *data_ptr, unsigned sectors) @@ -132,6 +155,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, m->sectors); } +static inline void gc_stripe_unlock(struct gc_stripe *s) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); +} + +static inline void gc_stripe_lock(struct gc_stripe *s) +{ + wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + struct bch_read_bio; struct ec_stripe_buf { @@ -212,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); @@ -221,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, 
struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); - void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); @@ -261,11 +294,12 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); +int bch2_check_stripe_to_lru_refs(struct bch_fs *); + #endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 8d1e70e830ac..06144bfd9c19 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -20,23 +20,15 @@ struct stripe { }; struct gc_stripe { + u8 lock; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 sectors; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; -struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4590cd0c7c90..101806d7ebe1 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,9 +116,11 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOENT, btree_node_dying) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -180,6 +182,12 @@ x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ + x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ + x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -200,6 +208,8 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -210,10 +220,18 @@ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - 
x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ - x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ - x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ + x(BCH_ERR_journal_res_blocked, journal_max_open) \ + x(BCH_ERR_journal_res_blocked, journal_full) \ + x(BCH_ERR_journal_res_blocked, journal_pin_full) \ + x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ + x(BCH_ERR_journal_res_blocked, journal_stuck) \ + x(BCH_ERR_journal_res_blocked, journal_retry_open) \ + x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ @@ -223,6 +241,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_offset) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ @@ -250,6 +269,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -258,17 +278,52 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ - x(EIO, no_device_to_read_from) \ + x(EIO, 
extent_poisened) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ + x(EIO, device_offline) \ + x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 
038da6a61f6b..207f35d3cce2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,8 +3,8 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? 
"device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { @@ -530,35 +546,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } -int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) { u32 restart_count = trans->restart_count; int ret = 0; - /* XXX: we don't yet attempt to print paths when we don't know the subvol */ - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } if (!inum.subvol || ret) prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + prt_printf(out, " offset %llu: ", offset); return trans_was_restarted(trans, restart_count); } -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) { - int ret = bch2_inum_err_msg_trans(trans, out, inum); - prt_printf(out, " offset %llu: ", offset); - return ret; + bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } -void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); + struct bch_fs 
*c = trans->c; + int ret = 0; + + if (!bch2_snapshot_is_leaf(c, pos.snapshot)) + prt_str(out, "(multiple snapshots) "); + + subvol_inum inum = { + .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), + .inum = pos.inode, + }; + + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } + + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + + prt_printf(out, " offset %llu: ", pos.offset << 8); + return 0; } -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7acf2a27ca28..7d3f0e2a5fd6 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) - -#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) 
\ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +static inline void bch2_account_io_success_fail(struct bch_dev *ca, + enum bch_member_error_type type, + bool success) +{ + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { + bch2_io_error(ca, type); + } +} + +static inline void bch2_account_io_completion(struct bch_dev *ca, + enum bch_member_error_type type, + u64 submit_time, bool success) +{ + if (unlikely(!ca)) + return; + + if (type != BCH_MEMBER_ERROR_checksum) + bch2_latency_acct(ca, submit_time, type); + + bch2_account_io_success_fail(ca, type, success); +} -int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); -void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2d8042f853dc..ae1a1d917805 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -28,6 +28,13 @@ #include "trace.h" #include "util.h" +static const char * const bch2_extent_flags_strs[] = { +#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, + BCH_EXTENT_FLAGS() +#undef x + NULL, +}; + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, 
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -59,53 +67,57 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? 
atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - l1 *= l1; - l2 *= l2; + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - /* Pick at random, biased in favor of the faster device: */ + if (unlikely(bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - return bch2_get_random_u64_below(l1 + l2) > l1; - } + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; + + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; - if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; + /* Pick at random, biased in favor of the faster device: */ - return p1.idx < p2.idx; + return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* @@ -115,64 +127,108 @@ static inline bool ptr_better(struct bch_fs *c, */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) + struct extent_ptr_decoded *pick, + int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct 
extent_ptr_decoded p; - struct bch_dev_io_failures *f; - int ret = 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return -BCH_ERR_extent_poisened; + rcu_read_lock(); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; + /* Are we being asked to read from a specific device? */ + if (dev >= 0 && p.ptr.dev != dev) + continue; struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + struct bch_dev_io_failures *f = + unlikely(failed) ? 
bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) - p.idx++; + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } - if (p.idx > (unsigned) p.has_ec) - continue; + have_missing_devs |= ca && !bch2_dev_is_online(ca); - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + if (!ca || !bch2_dev_is_online(ca)) { + if (!p.has_ec) + continue; + p.do_ec_reconstruct = true; + } + + if (bch2_force_reconstruct_read && p.has_ec) + p.do_ec_reconstruct = true; - *pick = p; - ret = 1; + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; + + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - return ret; + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; + + WARN_ONCE(1, "unhandled error case in %s\n", __func__); + return -EINVAL; } /* KEY_TYPE_btree_ptr: */ @@ -536,29 +592,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << 
type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); @@ -997,7 +1059,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, @@ -1220,6 +1282,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; + case BCH_EXTENT_ENTRY_flags: + 
prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); + break; + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1381,6 +1447,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, #endif break; } + case BCH_EXTENT_ENTRY_flags: + bkey_fsck_err_on(entry != ptrs.start, + c, extent_flags_not_at_start, + "extent flags entry not at start"); + break; } } @@ -1447,6 +1518,28 @@ void bch2_ptr_swab(struct bkey_s k) } } +int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) +{ + int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); + if (ret) + return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { + ptrs.start->flags.flags = flags; + } else { + struct bch_extent_flags f = { + .type = BIT(BCH_EXTENT_ENTRY_flags), + .flags = flags, + }; + __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1492,8 +1585,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - break; case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_flags: break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 204d765dd74c..e78a39e7e18f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ - (_ptr).has_ec = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -401,10 +402,10 @@ out: \ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); 
void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ @@ -753,4 +754,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } +static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) +{ + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) + return ptrs.start->flags.flags; + return 0; +} + +static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) +{ + return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); +} + +int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); + #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index c198dfc376d6..74c0252cbd98 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -79,8 +79,9 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 + x(rebalance, 5) \ + x(flags, 6) +#define BCH_EXTENT_ENTRY_MAX 7 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { #endif }; +#define BCH_EXTENT_FLAGS() \ + x(poisoned, 0) + +enum bch_extent_flags_e { +#define x(n, v) BCH_EXTENT_FLAG_##n = v, + BCH_EXTENT_FLAGS() +#undef x +}; + +struct bch_extent_flags { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + flags:57; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 flags:57, + type:7; +#endif +}; + /* bch_extent_rebalance: */ #include "rebalance_format.h" diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 43d6c341ecca..e51529dca4c2 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { }; struct 
extent_ptr_decoded { - unsigned idx; bool has_ec; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; @@ -31,10 +32,10 @@ struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:6, + failed_io:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 2eaffe37b5e7..0e742555cb0a 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr return cmp(a, b, priv); } -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, +static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, cmp_func, priv); } -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, +static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { - do_swap(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + do_swap(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, size, swap_func, priv); } -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +static void eytzinger1_sort_r(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) { - int i, j, k; + unsigned i, j, k; /* called from 'sort' without swap 
function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { - if (is_aligned(base, size, 8)) + if (is_aligned(base1, size, 8)) swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) + else if (is_aligned(base1, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { + for (i = n / 2; i >= 1; --i) { /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = i; k = j * 2, k < n;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == n) - j = j * 2 + 1; + if (j * 2 == n) + j *= 2; /* Backtrack to the correct location. */ - while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) - j = (j - 1) / 2; + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } /* sort */ - for (i = n - 1; i > 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + for (i = n; i > 1; --i) { + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); /* Find the sift-down path all the way to the leaves. */ - for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = 1; k = j * 2, k + 1 < i;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == i) - j = j * 2 + 1; + if (j * 2 + 1 == i) + j *= 2; /* Backtrack to the correct location. 
*/ - while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) - j = (j - 1) / 2; + while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ - for (k = j; j;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + for (k = j; j > 1;) { + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } } +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + void *base1 = base - size; + + return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); +} + void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 0541192d7bc0..643c1f716061 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -6,6 +6,7 @@ #include <linux/log2.h> #ifdef EYTZINGER_DEBUG +#include <linux/bug.h> #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) #else #define EYTZINGER_BUG_ON(cond) @@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) return rounddown_pow_of_two(size + 1) - 1; } -/* - * eytzinger1_next() and eytzinger1_prev() have the nice properties that - * - * eytzinger1_next(0) == eytzinger1_first()) - * eytzinger1_prev(0) == eytzinger1_last()) - * - * eytzinger1_prev(eytzinger1_first()) == 0 - * eytzinger1_next(eytzinger1_last()) == 0 - */ - static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i >>= i > size; } else { i >>= ffz(i) + 1; @@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > 
size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i -= 1; i >>= i > size; } else { @@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) +#define eytzinger0_for_each_prev(_i, _size) \ + for (unsigned (_i) = eytzinger0_last((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_prev((_i), (_size))) + /* return greatest node <= @search, or -1 if not found */ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - unsigned i, n = 0; - - if (!nr) - return -1; - - do { - i = n; - n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); - } while (n < nr); - - if (n & 1) { - /* - * @i was greater than @search, return previous node: - * - * if @i was leftmost/smallest element, - * eytzinger0_prev(eytzinger0_first())) returns -1, as expected - */ - return eytzinger0_prev(i, nr); - } else { - return i; - } + void *base1 = base - size; + unsigned n = 1; + + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n) + 1; + return n - 1; } +/* return smallest node > @search, or -1 if not found */ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + void *base1 = base - size; + unsigned n = 1; - /* - * if eytitzinger0_find_le() returned -1 - no element was <= search - we - * want to return the first element; next/prev identities mean this work - * as expected - * - * similarly if find_le() returns last element, we should return -1; - * identities mean this all works out: - */ - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } +/* 
return smallest node >= @search, or -1 if not found */ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - - if (idx < nr && !cmp(base + idx * size, search)) - return idx; + void *base1 = base - size; + unsigned n = 1; - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ + size_t _size = (size); \ + void *_base1 = (void *)(base) - _size; \ const void *_search = (search); \ size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + size_t _i = 1; \ int _res; \ \ - while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - _i; \ + while (_i <= _nr && \ + (_res = _cmp(_search, _base1 + _i * _size))) \ + _i = eytzinger1_child(_i, _res > 0); \ + _i - 1; \ }) void eytzinger0_sort_r(void *, size_t, size_t, diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ab1d5db2fa56..5ab1c73c8d4c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, if (!get_more) break; + unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); + + if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) + break; + + unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + + /* ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; @@ -149,12 +159,10 @@ static 
void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); @@ -211,14 +219,14 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -232,7 +240,8 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); @@ -280,12 +289,13 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), @@ -323,10 +333,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - 
rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -420,7 +430,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2089c36b5866..535bc5fcbcc0 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); + bool split = false; size_t shorten; ssize_t ret; @@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_read_bioset); - bio->bi_end_io = bch2_direct_IO_read_endio; - dio = container_of(bio, struct dio_read, rbio.bio); closure_init(&dio->cl, NULL); @@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { + split = true; + bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; start: bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; @@ -160,7 +160,15 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + struct bch_read_bio *rbio = + rbio_init(bio, + c, + opts, + split + ? 
bch2_direct_IO_read_split_endio + : bch2_direct_IO_read_endio); + + bch2_read(c, rbio, inode_inum(inode)); } blk_finish_plug(&plug); @@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + dio->op.flags |= BCH_WRITE_sync; + dio->op.flags |= BCH_WRITE_check_enospc; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, bio_sectors(bio), true); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 15725b4ce393..f45054cee746 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include <linux/compat.h> @@ -54,6 +54,32 @@ static int bch2_inode_flags_set(struct btree_trans *trans, (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; + if ((newflags ^ oldflags) & BCH_INODE_casefolded) { +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. */ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. 
+ */ + ret = bch2_empty_dir_trans(trans, inode_inum(inode)); + if (ret < 0) + return ret; + + ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); + if (ret) + return ret; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); +#else + printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); + return -EOPNOTSUPP; +#endif + } + if (s->set_projinherit) { bi->bi_fields_set &= ~(1 << Inode_opt_project); bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); @@ -218,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; @@ -511,10 +537,6 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -EXDEV; goto err; } - if (!d_is_positive(victim)) { - ret = -ENOENT; - goto err; - } ret = __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index d30f9bb056fd..ecd3bfdcde21 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -6,19 +6,21 @@ /* bcachefs inode flags -> vfs inode flags: */ static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_casefolded] = S_CASEFOLD, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_sync] = FS_SYNC_FL, + 
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 90ade8f648d9..c88c149d5de5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent 
to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -698,6 +700,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; +#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(vdir)) { + /* + * Do not cache a negative dentry in casefolded directories + * as it would need to be invalidated in the following situation: + * - Lookup file "blAH" in a casefolded directory + * - Creation of file "BLAH" in a casefolded directory + * - Lookup file "blAH" in a casefolded directory + * which would fail if we had a negative dentry. + * + * We should come back to this when VFS has a method to handle + * this edgecase. 
+ */ + return NULL; + } +#endif + return d_splice_alias(&inode->v, dentry); } @@ -858,10 +877,10 @@ err: return bch2_err_class(ret); } -static int bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) +static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); + return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); } static int bch2_rename2(struct mnt_idmap *idmap, @@ -1802,7 +1821,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, break; } - mapping_set_large_folios(inode->v.i_mapping); + mapping_set_folio_min_order(inode->v.i_mapping, + get_order(trans->c->opts.block_size)); } static void bch2_free_inode(struct inode *vinode) @@ -2008,44 +2028,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2192,6 +2174,9 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err; + if (opt_defined(opts, discard)) + set_bit(BCH_FS_discard_mount_opt_set, &c->flags); + /* Some options can't be parsed until after 
the fs is started: */ opts = bch2_opts_empty(); ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); @@ -2200,9 +2185,10 @@ static int bch2_fs_get_tree(struct fs_context *fc) bch2_opts_apply(&c->opts, opts); - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; + /* + * need to initialise sb and set c->vfs_sb _before_ starting fs, + * for blk_holder_ops + */ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2264,6 +2250,10 @@ got_sb: sb->s_shrink->seeks = 0; + ret = bch2_fs_start(c); + if (ret) + goto err_put_super; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); @@ -2351,8 +2341,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; + + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - return bch2_remount(sb, &fc->sb_flags, opts->opts); + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; + + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { @@ -2396,7 +2417,7 @@ static struct file_system_type bcache_fs_type = { .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, }; 
MODULE_ALIAS_FS("bcachefs"); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e85131d0af8..091057023fc5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -23,13 +23,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, struct bch_inode_unpacked *inode) { @@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - 
- dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. @@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; 
- target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - 
bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } @@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 339b80770f1d..80051073f613 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + 
int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } @@ -868,19 +867,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { @@ -1092,7 +1078,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1256,7 +1242,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 428b9be6af34..f82cfbf460d0 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index b99a5bf1a75e..117110af1e3f 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -137,7 +137,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(casefolded, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 5353979117b0..6b842c8d21be 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -115,7 +115,8 @@ err: bch2_increment_clock(c, 
sectors_allocated, WRITE); if (should_print_err(ret)) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aa91fcf51eec..f1503df57dc7 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -25,8 +25,15 @@ #include "subvolume.h" #include "trace.h" +#include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -80,6 +87,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; + struct work_struct work; struct data_update write; struct bio_vec bi_inline_vecs[]; /* must be last */ }; @@ -96,6 +104,33 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) +{ + EBUG_ON(rbio->split); + + return rbio->data_update + ? 
container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) + return false; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev && + u->data_opts.rewrite_ptrs & BIT(i)) + return true; + i++; + } + + return false; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, @@ -105,7 +140,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) + if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) @@ -125,98 +160,93 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void promote_free(struct bch_fs *c, struct promote_op *op) +static noinline void promote_free(struct bch_read_bio *rbio) { - int ret; + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + struct bch_fs *c = rbio->c; + + int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); bch2_data_update_exit(&op->write); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; + struct promote_op *op = container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.rbio.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); + 
promote_free(&op->write.rbio); } -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +static void promote_start_work(struct work_struct *work) { - struct bio *bio = &op->write.op.wbio.bio; + struct promote_op *op = container_of(work, struct promote_op, work); - trace_and_count(op->write.op.c, read_promote, &rbio->bio); + bch2_data_update_read_done(&op->write); +} - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); +static noinline void promote_start(struct bch_read_bio *rbio) +{ + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - bch2_data_update_read_done(&op->write, rbio->pick.crc); + INIT_WORK(&op->work, promote_start_work); + queue_work(rbio->c->write_ref_wq, &op->work); } -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio, - struct bch_io_failures *failed) +static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + unsigned sectors, + struct bch_read_bio *orig, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return ERR_PTR(-BCH_ERR_nopromote_no_writes); + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); - if (!op) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } + if 
(!have_io_error(failed)) { + update_opts.target = orig->opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags |= BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; + } else { + update_opts.target = orig->opts.foreground_target; - op->start_time = local_clock(); - op->pos = pos; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev) && + !ptr_being_rewritten(orig, ptr->dev)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_KERNEL); - if (!*rbio) { - ret = -BCH_ERR_nopromote_enomem; - goto err; + if (!update_opts.rewrite_ptrs) + return NULL; } - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { ret = -BCH_ERR_nopromote_enomem; - goto err; + goto err_put; } - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; + op->start_time = local_clock(); + op->pos = pos; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { @@ -224,64 +254,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, goto err; } - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - struct data_update_opts update_opts = {}; - - if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; - } else { - 
update_opts.target = opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - opts, + &orig->opts, update_opts, btree_id, k); /* * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ - if (ret) { - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); - goto err; - } + if (ret) + goto err_remove_hash; + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; + op->write.rbio.promote = true; op->write.op.end_io = promote_done; - return op; + return &op->write.rbio; +err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; + bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); +err_put: bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, +static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned flags, - struct bch_read_bio **rbio, + struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed) @@ -301,18 +310,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bpos pos = promote_full ? 
bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags, failed); + ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio, failed); + struct bch_read_bio *promote = + __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, sectors, orig, failed); + if (!promote) + return NULL; + ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -321,7 +333,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, *read_full = promote_full; return promote; nopromote: - trace_read_nopromote(c, ret); + trace_io_read_nopromote(c, ret); return NULL; } @@ -330,9 +342,17 @@ nopromote: static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9); + int ret = lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9)); + if (ret) + return ret; + + if (rbio->data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, @@ -341,10 +361,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -375,20 +391,25 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); - if (rbio->promote) - promote_free(rbio->c, 
rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); + percpu_ref_put(&ca->io_ref); + } if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->kmalloc) - kfree(rbio); - else + if (unlikely(rbio->promote)) { + if (!rbio->bio.bi_status) + promote_start(rbio); + else + promote_free(rbio); + } else { + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + } rbio = parent; } @@ -408,61 +429,49 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) +static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_slots); + struct data_update *u = container_of(rbio, struct data_update, rbio); retry: bch2_trans_begin(trans); - rbio->bio.bi_status = 0; - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + u->btree_id, bkey_start_pos(&u->k.k->k), + 0))); if (ret) goto err; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* 
extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; + rbio->ret = -BCH_ERR_data_read_key_overwritten; + goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - return; + bkey_start_pos(&u->k.k->k), + u->btree_id, + bkey_i_to_s_c(u->k.k), + 0, failed, flags, -1); err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + goto retry; + + if (ret) { + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } + + BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -477,45 +486,80 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); - trace_and_count(c, read_retry, &rbio->bio); + trace_io_read_retry(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], + bvec_iter_sectors(rbio->bvec_iter)); - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); + if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); + + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } - rbio->bio.bi_status = 0; + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; rbio = bch2_rbio_free(rbio); - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; + flags |= BCH_READ_in_retry; + flags &= ~BCH_READ_may_promote; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; - if (flags & BCH_READ_NODECODE) { - 
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + int ret = rbio->data_update + ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) + : __bch2_read(trans, rbio, iter, inum, &failed, flags); + + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + struct printbuf buf = PRINTBUF; + + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->data_update) + prt_str(&buf, "(internal move) "); + prt_str(&buf, "successful retry"); - __bch2_read(c, rbio, iter, inum, &failed, flags); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } + + bch2_rbio_done(rbio); + bch2_trans_put(trans); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - rbio->retry = retry; + BUG_ON(ret >= 0); - if (rbio->flags & BCH_READ_IN_RETRY) + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_parent(rbio)->saw_error = true; + + if (rbio->flags & BCH_READ_in_retry) return; - if (retry == READ_ERR) { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } else { rbio = bch2_rbio_free(rbio); - rbio->bio.bi_status = error; + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); } } @@ -531,15 +575,13 @@ static void bch2_read_io_err(struct work_struct *work) bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_read); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", 
buf.buf); - } printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -621,14 +663,12 @@ static void bch2_read_csum_err(struct work_struct *work) bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -648,7 +688,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -668,7 +708,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -678,9 +718,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -698,8 +740,26 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; + + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, + BLK_STS_IOERR); + goto out; + } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) goto csum_err; /* @@ -712,32 +772,40 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; + if (likely(!parent->data_update)) { + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, 
nonce, src); - if (ret) - goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + } else { + if (rbio->split) + rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -754,12 +822,9 @@ static void __bch2_read_endio(struct work_struct *work) ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } @@ -767,17 +832,6 @@ out: memalloc_nofs_restore(nofs_flags); return; csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - 
bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: @@ -797,10 +851,8 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; @@ -810,14 +862,14 @@ static void bch2_read_endio(struct bio *bio) return; } - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, read_reuse_race, &rbio->bio); + trace_and_count(c, io_read_reuse_race, &rbio->bio); - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -883,15 +935,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + struct data_update *u = rbio_data_update(orig); + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -902,19 +954,21 @@ int 
__bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); + this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], + bvec_iter_sectors(iter)); goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -930,6 +984,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } @@ -941,56 +996,57 @@ retry_pick: * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ - if ((flags & BCH_READ_IN_RETRY) && + if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref); goto retry_pick; } - if (flags & BCH_READ_NODECODE) { + if (likely(!u)) { + if (!(flags & BCH_READ_last_fragment) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_must_clone; + + narrow_crcs = !(flags & BCH_READ_in_retry) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + 
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { + read_full = true; + bounce = true; + } + } else { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); - goto hole; + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; - bounce = true; } if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full, failed); + rbio = promote_alloc(trans, iter, k, &pick, flags, orig, + &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1009,7 +1065,7 @@ retry_pick: pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } -get_bio: + if (rbio) { /* * promote already allocated bounce rbio: @@ -1024,17 +1080,16 @@ get_bio: } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = rbio_init(bio_alloc_bioset(NULL, + rbio = 
rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { + } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1043,11 +1098,10 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); rbio->bio.bi_iter = iter; - rbio->split = true; } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1056,67 +1110,60 @@ get_bio: EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - rbio->c = c; rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else + if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; - rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_NODECODE) - orig->pick = pick; - rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); + trace_and_count(c, io_read_bounce, &rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + if (!u) + 
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + else + this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); + trace_and_count(c, io_read_split, &orig->bio); } /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - if (!(flags & BCH_READ_IN_RETRY)) + if (!(flags & BCH_READ_in_retry)) bch2_trans_unlock(trans); else bch2_trans_unlock_long(trans); - if (!rbio->pick.idx) { + if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); @@ -1126,7 +1173,9 @@ get_bio: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, + -BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); goto out; } @@ -1135,10 +1184,10 @@ get_bio: bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); @@ -1152,15 +1201,16 @@ get_bio: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, 
-BCH_ERR_data_read_retry_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { bch2_trans_unlock(trans); @@ -1170,54 +1220,54 @@ out: rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; + if (flags & BCH_READ_in_retry) + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: + this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], + bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * have to signal that: */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; + if (u) + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) +int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct 
bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_NODECODE); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1267,24 +1317,26 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, - offset_into_extent, failed, flags); + offset_into_extent, failed, flags, -1); if (ret) goto err; - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } @@ -1292,17 +1344,22 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); - prt_printf(&buf, "read error %i from btree lookup", ret); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); + prt_printf(&buf, "read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); + return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h 
b/fs/bcachefs/io_read.h index a82e8a94ccb6..cd21950417f6 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "btree_iter.h" #include "reflink.h" struct bch_read_bio { @@ -35,19 +36,18 @@ struct bch_read_bio { u16 flags; union { struct { - u16 bounce:1, + u16 data_update:1, + promote:1, + bounce:1, split:1, - kmalloc:1, have_ioref:1, narrow_crcs:1, - hole:1, - retry:2, + saw_error:1, context:2; }; u16 _state; }; - - struct bch_devs_list devs_have; + s16 ret; struct extent_ptr_decoded pick; @@ -65,8 +65,6 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct promote_op *promote; - struct bch_io_opts opts; struct work_struct work; @@ -108,23 +106,31 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; } +#define BCH_READ_FLAGS() \ + x(retry_if_stale) \ + x(may_promote) \ + x(user_mapped) \ + x(last_fragment) \ + x(must_bounce) \ + x(must_clone) \ + x(in_retry) + +enum __bch_read_flags { +#define x(n) __BCH_READ_##n, + BCH_READ_FLAGS() +#undef x +}; + enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, +#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), + BCH_READ_FLAGS() +#undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); + struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, @@ -132,37 +138,55 @@ static inline void bch2_read_extent(struct btree_trans *trans, unsigned offset_into_extent, unsigned flags) { __bch2_read_extent(trans, rbio, 
rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); + data_btree, k, offset_into_extent, NULL, flags, -1); } -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + bch2_trans_run(c, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped)); +} + +static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + struct bch_read_bio *orig) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; + return rbio; } static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) + struct bch_fs *c, + struct bch_io_opts opts, + bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; + rbio->start_time = local_clock(); + rbio->c = c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; return rbio; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 03892388832b..29671075e3f1 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -34,6 +34,12 @@ #include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned 
bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -374,7 +380,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -396,29 +402,42 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); -} + struct printbuf buf = PRINTBUF; -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -{ - __bch2_write_op_error(out, op, op->pos.offset); + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + } else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, &buf, pos); + } + + prt_str(&buf, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(&buf, "\n from internal move "); + bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); + } + + bch_err_ratelimited(op->c, "%s", buf.buf); + printbuf_exit(&buf); } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) +static void bch2_write_csum_err_msg(struct bch_write_op *op) { - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -493,7 +512,7 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) + if (!(op->flags & BCH_WRITE_move)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -516,7 +535,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -539,7 +558,7 @@ static void __bch2_write_index(struct bch_write_op *op) unsigned dev; int ret = 0; - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; @@ -548,7 +567,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - ret = !(op->flags & BCH_WRITE_MOVE) + ret = !(op->flags & BCH_WRITE_move) ? 
bch2_write_index_default(op) : bch2_data_update_index_update(op); @@ -560,11 +579,8 @@ static void __bch2_write_index(struct bch_write_op *op) if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); } if (ret) @@ -573,21 +589,29 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -601,7 +625,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? 
WRITE_POINT_waiting_io : WRITE_POINT_stopped; @@ -615,8 +639,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_SUBMITTED) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -654,11 +678,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -676,13 +700,17 @@ static void bch2_write_endio(struct bio *bio) ? bch2_dev_have_ref(c, wbio->dev) : NULL; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_inum_offset_ratelimited(ca, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; + op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { @@ -692,10 +720,8 @@ static void bch2_write_endio(struct bio *bio) set_bit(wbio->dev, op->devs_need_flush->d); } - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); - } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -729,7 +755,7 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); + op->flags & BCH_WRITE_cached); bch2_keylist_push(&op->insert_keys); } @@ -789,7 +815,6 @@ static 
int bch2_write_rechecksum(struct bch_fs *c, { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; - int ret; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ @@ -797,10 +822,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); + int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); if (ret) return ret; @@ -810,44 +835,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, return 0; } -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) - return PREP_ENCODED_OK; + struct nonce nonce = 
extent_nonce(op->version, op->crc); + int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -858,12 +851,13 @@ static enum prep_encoded_ret { (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + op->csum_type != op->crc.csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } - return PREP_ENCODED_DO_WRITE; + return 1; } /* @@ -871,20 +865,23 @@ static enum prep_encoded_ret { * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); + struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + goto csum_err; + + if (bch2_csum_type_is_encryption(op->crc.csum_type)) { + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; - if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + ret = bch2_bio_uncompress_inplace(op, bio); + if (ret) + return ret; } /* @@ -896,22 +893,34 @@ static enum prep_encoded_ret { * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + if (op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != 
op->csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type) && + (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + goto csum_err; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; - return PREP_ENCODED_OK; + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + return 0; +csum_err: + bch2_write_csum_err_msg(op); + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -930,39 +939,44 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ec_buf = bch2_writepoint_ec_buf(c, wp); - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; + if (unlikely(op->flags & BCH_WRITE_data_encoded)) { + ret = bch2_write_prep_encoded_data(op, wp); + if (ret < 0) + goto err; + if (ret) { + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; } if (ec_buf || op->compression_opt || (op->csum_type && - 
!(op->flags & BCH_WRITE_PAGES_STABLE)) || + !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { + !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -976,7 +990,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, break; BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && + (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); @@ -1014,7 +1028,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } } - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { @@ -1046,7 +1060,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.compression_type = compression_type; crc.nonce = nonce; } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, @@ -1072,6 +1086,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if (write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1104,15 +1126,8 @@ do_write: 
*_dst = dst; return more; csum_err: - { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = -EIO; + bch2_write_csum_err_msg(op); + ret = -BCH_ERR_data_write_csum; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1190,39 +1205,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (ret) { - op->error = ret; + if (ret) break; - } } bch2_trans_put(trans); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); + } + + if (ret) + op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) { - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -BCH_ERR_data_write_io; + } else if 
(unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1251,7 +1263,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct bucket_to_lock *stale_at; int stale, ret; - if (op->flags & BCH_WRITE_MOVE) + if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); @@ -1309,7 +1321,7 @@ retry: }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ @@ -1317,7 +1329,7 @@ retry: bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { @@ -1342,7 +1354,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); @@ -1352,11 +1364,12 @@ retry: bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) break; bch2_btree_iter_advance(&iter); } @@ -1370,21 +1383,18 @@ err: darray_exit(&buckets); if (ret) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? 
*/ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { + if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { + } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { @@ -1414,7 +1424,7 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1436,7 +1446,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: @@ -1466,7 +1476,7 @@ again: ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, @@ -1489,16 +1499,12 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (!(op->flags & BCH_WRITE_alloc_nowait)) + bch2_write_op_error(op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); op->error = ret; break; } @@ -1524,14 +1530,14 @@ err: * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. 
*/ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_SUBMITTED) && - !(op->flags & BCH_WRITE_IN_WORKER))) { + if ((op->flags & BCH_WRITE_sync) || + (!(op->flags & BCH_WRITE_submitted) && + !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { @@ -1552,8 +1558,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_wrote_data_inline; + op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1616,8 +1622,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); - if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - op->flags |= BCH_WRITE_ALLOC_NOWAIT; + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); @@ -1625,11 +1631,8 @@ CLOSURE_CALLBACK(bch2_write) wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); - printbuf_exit(&buf); - op->error = -EIO; + bch2_write_op_error(op, op->pos.offset, "misaligned write"); + op->error = -BCH_ERR_data_write_misaligned; goto err; } @@ -1638,13 +1641,14 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && + if (!(op->flags & BCH_WRITE_move) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_move)) + 
this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1675,20 +1679,26 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b4626013abc8..b8ab19a1e1da 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -11,33 +11,27 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); +__printf(3, 4) +void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); #define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - 
x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(SUBMITTED) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) enum __bch_write_flags { #define x(f) __BCH_WRITE_##f, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 6e878a6f2f0b..3ef6df9145ef 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -64,7 +64,7 @@ struct bch_write_op { struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_ENCODED: */ + /* For BCH_WRITE_data_encoded: */ struct bch_extent_crc_unpacked crc; struct write_point_specifier write_point; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 05b1250619ec..bfdaea6569ae 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -20,13 +20,6 @@ #include "journal_seq_blacklist.h" #include "trace.h" -static const char * const bch2_journal_errors[] = { -#define x(n) #n, - JOURNAL_ERRORS() -#undef x - NULL -}; - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -56,11 +49,18 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); + if (!buf->write_started) + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); + struct closure *cl = &buf->io; + int r = atomic_read(&cl->remaining); + prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); + + if (buf->data) { + 
prt_printf(out, "size:\t"); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + } prt_printf(out, "expires:\t"); prt_printf(out, "%li jiffies\n", buf->expires - jiffies); @@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { + lockdep_assert_held(&j->lock); + out->atomic++; + if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); + + --out->atomic; } static inline struct journal_buf * @@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) EBUG_ON(seq > journal_cur_seq(j)); - if (journal_seq_unwritten(j, seq)) { + if (journal_seq_unwritten(j, seq)) buf = j->buf + (seq & JOURNAL_BUF_MASK); - EBUG_ON(le64_to_cpu(buf->data->seq) != seq); - } return buf; } @@ -139,8 +142,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bool stuck = false; struct printbuf buf = PRINTBUF; - if (!(error == JOURNAL_ERR_journal_full || - error == JOURNAL_ERR_journal_pin_full) || + if (!(error == -BCH_ERR_journal_full || + error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; @@ -167,7 +170,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) spin_unlock(&j->lock); bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (error %s)", - bch2_journal_errors[error]); + bch2_err_str(error)); bch2_journal_debug_to_text(&buf, j); bch_err(c, "%s", buf.buf); @@ -195,7 +198,8 @@ void bch2_journal_do_writes(struct journal *j) if (w->write_started) continue; - if (!journal_state_count(j->reservations, idx)) { + if (!journal_state_seq_count(j, j->reservations, seq)) { + j->seq_write_started = seq; w->write_started = true; closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -306,7 +310,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t bch2_journal_space_available(j); - __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) @@ -377,29 +381,41 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return JOURNAL_ERR_blocked; + return -BCH_ERR_journal_blocked; if (j->cur_entry_error) return j->cur_entry_error; - if (bch2_journal_error(j)) - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + int ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (!fifo_free(&j->pin)) - return JOURNAL_ERR_journal_pin_full; + return -BCH_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; + return -BCH_ERR_journal_max_in_flight; + + if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) + return -BCH_ERR_journal_max_open; if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + return -BCH_ERR_journal_shutdown; } + if (!j->free_buf && !buf->data) + return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + BUG_ON(!j->cur_entry_sectors); + if 
(!buf->data) { + swap(buf->data, j->free_buf); + swap(buf->buf_size, j->free_buf_size); + } + buf->expires = (journal_cur_seq(j) == j->flushed_seq_ondisk ? jiffies @@ -415,7 +431,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return JOURNAL_ERR_journal_full; + return -BCH_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -464,7 +480,7 @@ static int journal_entry_open(struct journal *j) new.idx++; BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); journal_state_inc(&new); @@ -514,6 +530,33 @@ static void journal_write_work(struct work_struct *work) spin_unlock(&j->lock); } +static void journal_buf_prealloc(struct journal *j) +{ + if (j->free_buf && + j->free_buf_size >= j->buf_size_want) + return; + + unsigned buf_size = j->buf_size_want; + + spin_unlock(&j->lock); + void *buf = kvmalloc(buf_size, GFP_NOFS); + spin_lock(&j->lock); + + if (buf && + (!j->free_buf || + buf_size > j->free_buf_size)) { + swap(buf, j->free_buf); + swap(buf_size, j->free_buf_size); + } + + if (unlikely(buf)) { + spin_unlock(&j->lock); + /* kvfree can sleep */ + kvfree(buf); + spin_lock(&j->lock); + } +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -525,25 +568,28 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (j->blocked) - return -BCH_ERR_journal_res_get_blocked; + return -BCH_ERR_journal_blocked; if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == 
ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = JOURNAL_ERR_max_in_flight; + ret = -BCH_ERR_journal_max_in_flight; goto out; } spin_lock(&j->lock); + journal_buf_prealloc(j); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() @@ -566,25 +612,48 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; + ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); out: - if (ret == JOURNAL_ERR_retry) - goto retry; - if (!ret) + if (likely(!ret)) return 0; + if (ret == -BCH_ERR_journal_retry_open) + goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_res_get_blocked; + ret = -BCH_ERR_journal_stuck; - if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + if (ret == -BCH_ERR_journal_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && + trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + + if (ret == -BCH_ERR_journal_max_open && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && + trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, 
journal_entry_full); @@ -594,8 +663,8 @@ out: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && + if ((ret == -BCH_ERR_journal_full || + ret == -BCH_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -608,9 +677,7 @@ out: } } - return ret == JOURNAL_ERR_insufficient_devices - ? -BCH_ERR_erofs_journal_err - : -BCH_ERR_journal_res_get_blocked; + return ret; } static unsigned max_dev_latency(struct bch_fs *c) @@ -640,7 +707,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), HZ)) return ret; @@ -654,7 +721,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait = max(0, remaining_wait - HZ); if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), remaining_wait)) return ret; @@ -666,7 +733,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, printbuf_exit(&buf); closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -687,7 +754,6 @@ void bch2_journal_entry_res_resize(struct journal *j, goto out; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - smp_mb(); state = READ_ONCE(j->reservations); if (state.cur_entry_offset < 
JOURNAL_ENTRY_CLOSED_VAL && @@ -907,7 +973,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); bch2_write_ref_put(c, BCH_WRITE_REF_journal); @@ -951,7 +1017,8 @@ static void __bch2_journal_block(struct journal *j) new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -992,7 +1059,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou *blocked = true; } - ret = journal_state_count(s, idx) > open + ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -1349,6 +1416,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1389,8 +1457,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - j->reservations.unwritten_idx++; + j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1443,7 +1510,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, 
nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; @@ -1482,6 +1549,7 @@ void bch2_fs_journal_exit(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvfree(j->buf[i].data); + kvfree(j->free_buf); free_fifo(&j->pin); } @@ -1508,13 +1576,13 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { - j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) - return -BCH_ERR_ENOMEM_journal_buf; + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; + j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); + if (!j->free_buf) + return -BCH_ERR_ENOMEM_journal_buf; + + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - } j->pin.front = j->pin.back = 1; @@ -1564,6 +1632,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "average write size:\t"); prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); prt_newline(out); + prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); @@ -1571,7 +1640,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 107f7f901cd9..47828771f9c2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) closure_wake_up(&j->async_wait); } -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) return j->seq_ondisk + 1; } +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + unsigned idx = (journal_cur_seq(j) & + JOURNAL_BUF_MASK & + ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; + + return j->buf + idx; +} + static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { @@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) BUG(); } +static inline int journal_state_seq_count(struct journal *j, + union journal_res_state s, u64 seq) +{ + if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) + return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); + else + return 0; +} + static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; @@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { - return vstruct_idx(j->buf[res->idx].data, 
res->offset); + return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, @@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); -static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s bch2_journal_buf_put_final(j, seq); } -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx, res->seq); + bch2_journal_buf_put(j, res->seq); res->ref = 0; } @@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j, /* * Check if there is still room in the current journal - * entry: + * entry, smp_rmb() guarantees that reads from reservations.counter + * occur before accessing cur_entry_u64s: */ + smp_rmb(); if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; @@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j, &old.v, new.v)); res->ref = true; - res->idx = old.idx; res->offset = old.cur_entry_offset; - res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + res->seq = journal_cur_seq(j); + res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; return 1; } @@ -390,6 +407,7 @@ out: (flags & JOURNAL_RES_GET_NONBLOCK) != 0, NULL, _THIS_IP_); 
EBUG_ON(!res->ref); + BUG_ON(!res->seq); } return 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 11c39e0c34f4..4ed6137f0439 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1041,13 +1041,19 @@ reread: bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); + u64 submit_time = local_clock(); ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, - "journal read error: sector %llu", - offset) || - bch2_meta_read_fault("journal")) { + if (!ret && bch2_meta_read_fault("journal")) + ret = -BCH_ERR_EIO_fault_injected; + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); + + if (ret) { + bch_err_dev_ratelimited(ca, + "journal read error: sector %llu", offset); /* * We don't error out of the recovery process * here, since the relevant journal entry may be @@ -1110,13 +1116,16 @@ reread: struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); - if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { + bch_err_dev_ratelimited(ca, "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf)); saw_bad = true; + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1515,7 +1524,7 @@ static void __journal_write_alloc(struct journal *j, * @j: journal object * @w: journal buf (entry to be written) * - * Returns: 0 on success, or -EROFS on failure + * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure */ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { @@ -1600,18 +1609,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 
kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1621,12 +1624,11 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; + err = bch2_mark_replicas(c, &replicas.e); } if (err) @@ -1641,7 +1643,23 @@ static CLOSURE_CALLBACK(journal_write_done) j->err_seq = seq; w->write_done = true; + if (!j->free_buf || j->free_buf_size < w->buf_size) { + swap(j->free_buf, w->data); + swap(j->free_buf_size, w->buf_size); + } + + if (w->data) { + void *buf = w->data; + w->data = NULL; + w->buf_size = 0; + + spin_unlock(&j->lock); + kvfree(buf); + spin_lock(&j->lock); + } + bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1650,11 +1668,10 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->write_done) break; - if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !w->noflush) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1671,16 +1688,6 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - 
BUG_ON(journal_state_count(new, new.unwritten_idx)); - BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - - new.unwritten_idx++; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - closure_wake_up(&w->wait); completed = true; } @@ -1695,7 +1702,7 @@ static CLOSURE_CALLBACK(journal_write_done) } if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1715,6 +1722,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) @@ -1724,13 +1734,16 @@ static void journal_write_endio(struct bio *bio) struct journal *j = &ca->fs->journal; struct journal_buf *w = j->buf + jbio->buf_idx; - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + jbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("journal")) { - unsigned long flags; + bch2_blk_status_to_str(bio->bi_status)); + unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); @@ -1759,7 +1772,11 @@ static CLOSURE_CALLBACK(journal_write_submit) sectors); struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; + struct journal_bio *jbio = ja->bio[w->idx]; + struct bio *bio = &jbio->bio; + + jbio->submit_time = local_clock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1791,6 
+1808,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to complete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { @@ -1984,7 +2005,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * write anything at all. */ if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return -EIO; + return error; if (error || w->noflush || diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index d373cd181a7f..5d1547aa118a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j) bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - ret = JOURNAL_ERR_insufficient_devices; + ret = -BCH_ERR_insufficient_journal_devices; goto out; } @@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. 
@@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } bch2_journal_do_discards(j); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 1f25c111c54c..e463d2d95359 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - unsigned i; - for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { + src = bl->start; + eytzinger0_for_each(i, nr) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; + src++; } unsigned new_nr = dst - bl->start; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1ef3a28ed6ab..8e0eba776b9d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -12,7 +12,11 @@ /* btree write buffer steals 8 bits for its own purposes: */ #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) -#define JOURNAL_BUF_BITS 2 +#define JOURNAL_STATE_BUF_BITS 2 +#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) +#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) + +#define JOURNAL_BUF_BITS 4 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -82,7 +86,6 @@ struct journal_entry_pin { struct journal_res { bool ref; - u8 idx; u16 u64s; u32 offset; u64 seq; @@ -98,9 +101,8 @@ union journal_res_state { }; struct { - u64 cur_entry_offset:20, + u64 cur_entry_offset:22, idx:2, - unwritten_idx:2, buf0_count:10, buf1_count:10, 
buf2_count:10, @@ -110,13 +112,13 @@ union journal_res_state { /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ /* * We stash some journal state as sentinal values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) @@ -149,28 +151,12 @@ enum journal_flags { #undef x }; -/* Reasons we may fail to get a journal reservation: */ -#define JOURNAL_ERRORS() \ - x(ok) \ - x(retry) \ - x(blocked) \ - x(max_in_flight) \ - x(journal_full) \ - x(journal_pin_full) \ - x(journal_stuck) \ - x(insufficient_devices) - -enum journal_errors { -#define x(n) JOURNAL_ERR_##n, - JOURNAL_ERRORS() -#undef x -}; - typedef DARRAY(u64) darray_u64; struct journal_bio { struct bch_dev *ca; unsigned buf_idx; + u64 submit_time; struct bio bio; }; @@ -199,7 +185,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum journal_errors cur_entry_error; + int cur_entry_error; unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; @@ -220,6 +206,8 @@ struct journal { * other is possibly being written out. 
*/ struct journal_buf buf[JOURNAL_BUF_NR]; + void *free_buf; + unsigned free_buf_size; spinlock_t lock; @@ -237,6 +225,7 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; + u64 seq_write_started; /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce794d55818f..a299d9ec8ee4 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" +#include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) +int __bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { if (old_time == new_time) return 0; @@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = { }; int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, u64 time, + u16 lru_id, + u64 dev_bucket, + u64 time, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { @@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans, struct btree_iter lru_iter; struct bkey_s_c lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, - bucket_to_u64(referring_k.k->p), - time), 0); + lru_pos(lru_id, dev_bucket, time), 0); int ret = bkey_err(lru_k); if (ret) return ret; @@ -104,7 +105,7 @@ int bch2_lru_check_set(struct btree_trans *trans, " %s", bch2_lru_types[lru_type(lru_k)], (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) goto 
err; } @@ -116,49 +117,73 @@ fsck_err: return ret; } +static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) +{ + enum bch_lru_type type = lru_type(lru_k); + + switch (type) { + case BCH_LRU_read: + case BCH_LRU_fragmentation: + return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + case BCH_LRU_stripes: + return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); + default: + BUG(); + } +} + +static u64 bkey_lru_type_idx(struct bch_fs *c, + enum bch_lru_type type, + struct bkey_s_c k) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + switch (type) { + case BCH_LRU_read: + a = bch2_alloc_to_v4(k, &a_convert); + return alloc_lru_idx_read(*a); + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + u64 idx = ca + ? alloc_lru_idx_fragmentation(*a, ca) + : 0; + rcu_read_unlock(); + return idx; + } + case BCH_LRU_stripes: + return k.k->type == KEY_TYPE_stripe + ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) + : 0; + default: + BUG(); + } +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - enum bch_lru_type type = lru_type(lru_k); - struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); - u64 idx; - int ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); - if (fsck_err_on(!ca, - trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + struct bbpos bp = lru_pos_to_bp(lru_k); - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + int ret = bkey_err(k); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - - switch (type) { - case BCH_LRU_read: - idx = alloc_lru_idx_read(*a); - break; - case BCH_LRU_fragmentation: - idx = alloc_lru_idx_fragmentation(*a, ca); - break; - } + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); - if (lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx) { + if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) goto err; @@ -176,7 +201,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index f31a6cf1514c..8abd0aa2083a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,9 
+28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_FRAGMENTATION_START) + switch (lru_id) { + case BCH_LRU_BUCKET_FRAGMENTATION: return BCH_LRU_fragmentation; - return BCH_LRU_read; + case BCH_LRU_STRIPE_FRAGMENTATION: + return BCH_LRU_stripes; + default: + return BCH_LRU_read; + } } int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); @@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +static inline int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + return old_time != new_time + ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) + : 0; +} struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index f372cb3b8cda..b7392ad8e41f 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -9,7 +9,8 @@ struct bch_lru { #define BCH_LRU_TYPES() \ x(read) \ - x(fragmentation) + x(fragmentation) \ + x(stripes) enum bch_lru_type { #define x(n) BCH_LRU_##n, @@ -17,7 +18,8 @@ enum bch_lru_type { #undef x }; -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) +#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) +#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ddc187fb693d..57ad662871ba 100644 --- 
a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -15,6 +15,7 @@ #include "keylist.h" #include "migrate.h" #include "move.h" +#include "progress.h" #include "replicas.h" #include "super-io.h" @@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_usrdata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + })); if (ret) break; } @@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } -static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_metadata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans; struct btree_iter iter; @@ -125,6 +132,8 @@ retry: while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { + bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; @@ -169,6 +178,11 @@ err: int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + struct progress_indicator_state progress; + 
bch2_progress_init(&progress, c, + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink)); + + return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, &progress, dev_idx, flags); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 160b4374160a..8fcdc6984f6e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, +static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - if (trace_move_extent_enabled()) { + if (trace_io_move_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_move_extent(c, buf.buf); + trace_io_move(c, buf.buf); printbuf_exit(&buf); } } -static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_move_extent_read_enabled()) { + if (trace_io_move_read_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_read(c, buf.buf); + trace_io_move_read(c, buf.buf); printbuf_exit(&buf); } } @@ -74,11 +74,7 @@ struct moving_io { unsigned read_sectors; unsigned write_sectors; - struct bch_read_bio rbio; - struct data_update write; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -88,43 +84,72 @@ static void move_free(struct moving_io *io) if (io->b) atomic_dec(&io->b->count); - bch2_data_update_exit(&io->write); - mutex_lock(&ctxt->lock); list_del(&io->io_list); wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); + if (!io->write.data_opts.scrub) { + bch2_data_update_exit(&io->write); + } else { + bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); + 
kfree(io->write.bvecs); + } kfree(io); } static void move_write_done(struct bch_write_op *op) { struct moving_io *io = container_of(op, struct moving_io, write.op); + struct bch_fs *c = op->c; struct moving_context *ctxt = io->write.ctxt; - if (io->write.op.error) + if (op->error) { + if (trace_io_move_write_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_write_op_to_text(&buf, op); + prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); + trace_io_move_write_fail(c, buf.buf); + printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + ctxt->write_error = true; + } - atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_dec(&io->write.ctxt->write_ios); + atomic_sub(io->write_sectors, &ctxt->write_sectors); + atomic_dec(&ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } static void move_write(struct moving_io *io) { - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + struct moving_context *ctxt = io->write.ctxt; + + if (ctxt->stats) { + if (io->write.rbio.bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); + else if (io->write.rbio.saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + + if (unlikely(io->write.rbio.ret || + io->write.rbio.bio.bi_status || + io->write.data_opts.scrub)) { move_free(io); return; } - if (trace_move_extent_write_enabled()) { + if (trace_io_move_write_enabled()) { struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_move_extent_write(c, buf.buf); + trace_io_move_write(c, buf.buf); printbuf_exit(&buf); } @@ -132,7 +157,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc); + 
bch2_data_update_read_done(&io->write); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) @@ -145,7 +170,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx static void move_read_endio(struct bio *bio) { - struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); @@ -258,14 +283,10 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct moving_io *io; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - trace_move_extent2(c, k, &io_opts, &data_opts); + trace_io_move2(c, k, &io_opts, &data_opts); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); @@ -273,7 +294,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas) { + !data_opts.extra_replicas && + !data_opts.scrub) { if (data_opts.kill_ptrs) return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; @@ -285,13 +307,7 @@ int bch2_move_extent(struct moving_context *ctxt, */ bch2_trans_unlock(trans); - /* write path might have to decompress data: */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); - - pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - io = kzalloc(sizeof(struct moving_io) + - sizeof(struct bio_vec) * pages, GFP_KERNEL); + struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); if (!io) goto err; @@ -300,31 +316,27 @@ int bch2_move_extent(struct moving_context *ctxt, 
io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->write.op.wbio.bio.bi_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, - GFP_KERNEL)) - goto err_free; + if (!data_opts.scrub) { + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + &io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free; - io->rbio.c = c; - io->rbio.opts = io_opts; - bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->rbio.bio.bi_vcnt = pages; - io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->rbio.bio.bi_iter.bi_size = sectors << 9; + io->write.op.end_io = move_write_done; + } else { + bch2_bkey_buf_init(&io->write.k); + bch2_bkey_buf_reassemble(&io->write.k, c, k); - io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = move_read_endio; + io->write.op.c = c; + io->write.data_opts = data_opts; - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free_pages; + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); + if (ret) + goto err_free; + } - io->write.op.end_io = move_write_done; + io->write.rbio.bio.bi_end_io = move_read_endio; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); @@ -339,9 +351,7 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read2(c, k); + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -356,33 +366,33 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing 
wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - BCH_READ_NODECODE| - BCH_READ_LAST_FRAGMENT); + __bch2_read_extent(trans, &io->write.rbio, + io->write.rbio.bio.bi_iter, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + NULL, + BCH_READ_last_fragment, + data_opts.scrub ? data_opts.read_dev : -1); return 0; -err_free_pages: - bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - if (ret == -BCH_ERR_data_update_done) + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) return 0; if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - count_event(c, move_extent_start_fail); + count_event(c, io_move_start_fail); - if (trace_move_extent_start_fail_enabled()) { + if (trace_io_move_start_fail_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, ": "); prt_str(&buf, bch2_err_str(ret)); - trace_move_extent_start_fail(c, buf.buf); + trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } return ret; @@ -551,6 +561,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| + BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); if (ctxt->rate) @@ -581,7 +592,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + s64 offset_into_extent = 0; bch2_trans_iter_exit(trans, &reflink_iter); k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); @@ -600,6 +611,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, * pointer - need to fixup iter->k */ extent_iter = &reflink_iter; + offset_into_extent = 0; } if 
(!bkey_extent_is_direct_data(k.k)) @@ -627,7 +639,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; - if (ret2 == -ENOMEM) { + if (bch2_err_matches(ret2, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); continue; @@ -689,21 +701,22 @@ int bch2_move_data(struct bch_fs *c, bool wait_on_copygc, move_pred_fn pred, void *arg) { - struct moving_context ctxt; - int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_move_data(&ctxt, start, end, pred, arg); + int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts _data_opts) +static int __bch2_move_data_phys(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + unsigned data_types, + move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -712,16 +725,19 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - struct data_update_opts data_opts; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + struct bch_dev *ca = bch2_dev_tryget(c, dev); if (!ca) return 0; - trace_bucket_evacuate(c, &bucket); + bucket_end = min(bucket_end, ca->mi.nbuckets); + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); + bch2_dev_put(ca); + ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -732,8 +748,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, 
*/ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), 0); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -757,7 +772,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + if (!k.k || bkey_gt(k.k->p, bp_end)) break; if (k.k->type != KEY_TYPE_backpointer) @@ -765,107 +780,148 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; + if (ctxt->stats) + ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + if (!(data_types & BIT(bp.v->data_type))) + goto next; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); + if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) + goto next; + + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + if (!bp.v->level) { ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } + } - data_opts = _data_opts; - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; - - unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - if (p.ptr.dev == bucket.inode) { - if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - data_opts.rewrite_ptrs |= 1U << i; 
- break; - } - i++; - } - - ret = bch2_move_extent(ctxt, bucket_in_flight, - &iter, k, io_opts, data_opts); + struct data_update_opts data_opts = {}; + if (!pred(c, arg, k, &io_opts, &data_opts)) { bch2_trans_iter_exit(trans, &iter); + goto next; + } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; - } else { - struct btree *b; + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { + bch2_trans_iter_exit(trans, &iter); + ret = -BCH_ERR_device_offline; + break; + } - b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!b) - goto next; + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); - unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + /* move_extent will drop locks */ + unsigned sectors = bp.v->bucket_len; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - bch2_trans_iter_exit(trans, &iter); + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; + bch2_trans_iter_exit(trans, &iter); - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, sectors); - if (ctxt->stats) { - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); 
- } - sectors_moved += btree_sectors(c); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); next: bch2_btree_iter_advance(&bp_iter); } - - trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); return ret; } +static int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + + bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct evacuate_bucket_arg *arg = _arg; + + *data_opts = arg->data_opts; + + unsigned i = 0; + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == arg->bucket.inode && + (arg->gen < 0 || arg->gen == ptr->gen) && + !ptr->cached) + data_opts->rewrite_ptrs |= BIT(i); + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +int bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, 
int gen, + struct data_update_opts data_opts) +{ + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + ~0, + evacuate_bucket_pred, &arg); +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); @@ -1007,14 +1063,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } -static bool migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - /* * Ancient versions of bcachefs produced packed formats which could represent * keys that the in memory format cannot represent; this checks for those @@ -1104,6 +1152,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +static bool scrub_pred(struct bch_fs *c, void *_arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bch_ioctl_data *arg = _arg; + + if (k.k->type != KEY_TYPE_btree_ptr_v2) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == arg->migrate.dev) { + if (!p.crc.csum_type) + return false; + break; + } + } + + data_opts->scrub = true; + data_opts->read_dev = arg->migrate.dev; + return true; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -1118,6 +1190,22 @@ int bch2_data_job(struct bch_fs *c, bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); switch (op.op) { + case BCH_DATA_OP_scrub: + /* + * prevent tests from spuriously 
failing, make sure we see all + * btree nodes that need to be repaired + */ + bch2_btree_interior_updates_flush(c); + + ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, + op.scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, + scrub_pred, &op) ?: ret; + break; + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -1137,14 +1225,14 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, start, end, - migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; + ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, + ~0, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_rewrite_old_nodes: @@ -1176,17 +1264,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, 
atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1195,7 +1283,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1215,8 +1304,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) - bch2_write_op_to_text(out, &io->write.op); + bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index e22841ef31e4..807f779f6f76 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -3,22 +3,36 @@ #define _BCACHEFS_MOVE_TYPES_H #include "bbpos_types.h" +#include "bcachefs_ioctl.h" struct bch_move_stats { - enum bch_data_type data_type; - struct bbpos pos; char name[32]; + bool phys; + enum bch_ioctl_data_event_ret ret; + + union { + struct { + enum bch_data_type data_type; + struct bbpos pos; + }; + struct { + unsigned dev; + u64 offset; + }; + }; atomic64_t keys_moved; atomic64_t keys_raced; atomic64_t sectors_seen; atomic64_t sectors_moved; atomic64_t sectors_raced; + atomic64_t sectors_error_corrected; + atomic64_t sectors_error_uncorrected; }; struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6718dc37c5a3..5126c870ce5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,8 +167,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - 
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = 0; @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c index 2c3d46ac70c6..93246ad31541 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/namei.c @@ -4,8 +4,8 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" @@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; + /* Inherit casefold state from parent. 
*/ + if (S_ISDIR(mode)) + new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded; + if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); @@ -153,16 +157,14 @@ int bch2_create_trans(struct btree_trans *trans, dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - ret = bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates); + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + &dir_u->bi_size, + STR_HASH_must_create|BTREE_ITER_with_updates) ?: + bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; @@ -225,7 +227,9 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum.inum, &dir_offset, + name, inum.inum, + &dir_offset, + &dir_u->bi_size, STR_HASH_must_create); if (ret) goto err; @@ -417,8 +421,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, + src_dir, &src_hash, &src_dir_u->bi_size, + dst_dir, &dst_hash, &dst_dir_u->bi_size, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -560,6 +564,8 @@ err: return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -650,3 +656,179 @@ disconnected: prt_str_reversed(path, "(disconnected)"); goto out; } + +/* fsck */ + +static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; + int ret = 0; + + if 
(inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if 
(S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? + */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } + + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + 
bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h index 2b59210bb5e8..2e6f6364767f 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} + +#endif /* _BCACHEFS_NAMEI_H */ diff --git 
a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6772faf385a5..81fd6b7977d3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -163,16 +163,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; -u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) -{ - BUG(); -} - -void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) -{ - BUG(); -} - void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ @@ -223,6 +213,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } +/* dummy option, for options that aren't stored in the superblock */ +typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); +typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); +typedef u64 (*member_opt_get_fn)(const struct bch_member *); +typedef void (*member_opt_set_fn)(struct bch_member *, u64); + +__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; +__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; +__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; + +#define type_compatible_or_null(_p, _type) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) + const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ @@ -239,15 +244,15 @@ const struct bch_option bch2_opt_table[] = { #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ - .attr = { \ - .name = #_name, \ - .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ - }, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = _sb_opt, \ - .set_sb = SET_##_sb_opt, \ + .attr.name = #_name, \ + .attr.mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ + .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ + .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ + .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ _type \ }, @@ -475,11 +480,18 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) { + lockdep_assert_held(&c->state_lock); + int ret = 0; switch (id) { + case Opt_state: + if (ca) + return __bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); + break; + case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); @@ -495,12 +507,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) int bch2_opts_check_may_set(struct bch_fs *c) { - unsigned i; - int ret; - - for (i = 0; i < bch2_opts_nr; i++) { - ret = bch2_opt_check_may_set(c, i, - bch2_opt_get_by_id(&c->opts, i)); + for (unsigned i = 0; i < bch2_opts_nr; i++) { + int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -619,12 +627,25 @@ out: return ret; } -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) { const struct bch_option *opt = bch2_opt_table + id; u64 v; - v = opt->get_sb(sb); + if (dev_idx < 0) { + v = opt->get_sb(sb); + } else { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return 0; + + struct bch_member m = bch2_sb_member_get(sb, dev_idx); + v = opt->get_member(&m); + } + + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + --v; if (opt->flags & OPT_SB_FIELD_ILOG2) v = 1ULL << v; @@ -641,35 +662,19 @@ u64 bch2_opt_from_sb(struct bch_sb 
*sb, enum bch_opt_id id) */ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { - unsigned id; - - for (id = 0; id < bch2_opts_nr; id++) { + for (unsigned id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb == BCH2_NO_SB_OPT) - continue; - - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + if (opt->get_sb) + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); } return 0; } -struct bch_dev_sb_opt_set { - void (*set_sb)(struct bch_member *, u64); -}; - -static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { -#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, - BCH_DEV_OPT_SETTERS() -#undef x -}; - void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { - enum bch_opt_id id = opt - bch2_opt_table; - if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,24 +684,16 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if (opt->flags & OPT_FS) { - if (opt->set_sb != SET_BCH2_NO_SB_OPT) - opt->set_sb(sb, v); - } + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) + opt->set_sb(sb, v); - if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) return; - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - - const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; - if (set->set_sb) - set->set_sb(m, v); - else - pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); } } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9d397fc2a1f0..bb621804d45a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -50,10 +50,6 @@ static inline const char *bch2_d_type_str(unsigned d_type) * apply the 
options from that struct that are defined. */ -/* dummy option, for options that aren't stored in the superblock */ -u64 BCH2_NO_SB_OPT(const struct bch_sb *); -void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); - /* When can be set: */ enum opt_flags { OPT_FS = BIT(0), /* Filesystem option */ @@ -132,19 +128,24 @@ enum fsck_err_opts { OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 8, \ + BCH_SB_BLOCK_SIZE, 4 << 10, \ "size", NULL) \ x(btree_node_size, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 512, \ + BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ @@ -181,6 +182,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ @@ -197,7 +203,7 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", 
"Device or label for metadata writes") \ @@ -308,11 +314,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ - x(discard, u8, \ - OPT_FS|OPT_MOUNT|OPT_DEVICE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -493,27 +494,32 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ - x(fs_size, u64, \ - OPT_DEVICE, \ + x(state, u64, \ + OPT_DEVICE|OPT_RUNTIME, \ + OPT_STR(bch2_member_states), \ + BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ + "state", "rw,ro,failed,spare") \ + x(bucket_size, u32, \ + OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ - x(bucket, u32, \ - OPT_DEVICE, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ + BCH_MEMBER_BUCKET_SIZE, 0, \ "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ - OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ + OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH2_NO_SB_OPT, 1, \ + BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ x(data_allowed, u8, \ OPT_DEVICE, \ OPT_BITFIELD(__bch2_data_types), \ - BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ + x(discard, u8, \ + OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_MEMBER_DISCARD, true, \ + NULL, "Enable discard/TRIM support") \ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -521,11 +527,6 @@ enum fsck_err_opts { NULL, 
"BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") -#define BCH_DEV_OPT_SETTERS() \ - x(discard, BCH_MEMBER_DISCARD) \ - x(durability, BCH_MEMBER_DURABILITY) \ - x(data_allowed, BCH_MEMBER_DATA_ALLOWED) - struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() @@ -582,8 +583,6 @@ struct printbuf; struct bch_option { struct attribute attr; - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); enum opt_type type; enum opt_flags flags; u64 min, max; @@ -595,6 +594,12 @@ struct bch_option { const char *hint; const char *help; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + + u64 (*get_member)(const struct bch_member *); + void (*set_member)(struct bch_member *, u64); + }; extern const struct bch_option bch2_opt_table[]; @@ -603,7 +608,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); @@ -625,7 +630,7 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c new file mode 100644 index 000000000000..bafd1c91a802 --- /dev/null +++ b/fs/bcachefs/progress.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" 
+#include "disk_accounting.h" +#include "progress.h" + +void bch2_progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = i, + }; + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); + } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); + + if (ret) + s->next_print = jiffies + HZ * 10; + return ret; +} + +void bch2_progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h new file mode 100644 index 000000000000..23fb1811f943 --- /dev/null +++ b/fs/bcachefs/progress.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_PROGRESS_H +#define _BCACHEFS_PROGRESS_H + +/* + * Lame progress indicators + * + * We don't like to use these because they print to the dmesg console, which is + * spammy - we much prefer to be wired up to a userspace program (e.g. via + * thread_with_file) and have it print the progress indicator. 
+ * + * But some code is old and doesn't support that, or runs in a context where + * that's not yet practical (mount). + */ + +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); +void bch2_progress_update_iter(struct btree_trans *, + struct progress_indicator_state *, + struct btree_iter *, + const char *); + +#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d0a1f5cd5c2b..29a569384146 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -341,7 +346,7 @@ static struct bkey_s_c 
next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { /* @@ -449,7 +454,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, { data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; return data_opts->rewrite_ptrs != 0; } @@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, }; + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - 
r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 71c786cdb192..266c5770c824 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -13,12 +13,12 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" @@ -899,7 +899,7 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += 8; + journal_seq += JOURNAL_BUF_NR * 4; if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 418557960ed6..e89b9c783285 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -24,7 +24,7 @@ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ x(check_allocations, 5, PASS_FSCK) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 441e648f28b5..68172c6eba21 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, BUG_ON(missing_start < refd_start); BUG_ON(missing_end > refd_end); - if (fsck_err(trans, 
reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - missing_start, missing_end)) { + struct bpos missing_pos = bkey_start_pos(p.k); + missing_pos.offset += missing_start - live_start; + + prt_printf(&buf, "pointer to missing indirect extent in "); + ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); + if (ret) + goto err; + + prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9); + bch2_bkey_val_to_text(&buf, c, p.s_c); + + prt_printf(&buf, "\n missing reflink btree range %llu-%llu", + missing_start, missing_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ret = PTR_ERR_OR_ZERO(new); if (ret) @@ -597,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 dst_done = 0; u32 dst_snapshot, src_snapshot; bool reflink_p_may_update_opts_field = - bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 6992e7469112..2b4b8445d418 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -5,7 +5,13 @@ /* BCH_SB_FIELD_counters */ -static const char * const bch2_counter_names[] = { +static const u8 counters_to_stable_map[] = { +#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, + BCH_PERSISTENT_COUNTERS() +#undef x +}; + +const char * const bch2_counter_names[] = { #define x(t, n, ...) 
(#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -}; +} static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; -}; +} static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (unsigned i = 0; i < nr; i++) - prt_printf(out, "%s \t%llu\n", - i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", - le64_to_cpu(ctrs->d[i])); -}; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + prt_printf(out, "%s \t%llu\n", + bch2_counter_names[i], + le64_to_cpu(ctrs->d[stable])); + } +} int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - u64 val = 0; - for (i = 0; i < BCH_COUNTER_NR; i++) + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { - val = le64_to_cpu(ctrs->d[i]); - percpu_u64_set(&c->counters[i], val); - c->counters_on_mount[i] = val; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) { + u64 v = le64_to_cpu(ctrs->d[stable]); + percpu_u64_set(&c->counters[i], v); + c->counters_on_mount[i] = v; + } } + return 0; -}; +} int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; - unsigned int i; 
unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + } - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) - ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); return 0; } @@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; + +#ifndef NO_BCACHEFS_CHARDEV +long bch2_ioctl_query_counters(struct bch_fs *c, + struct bch_ioctl_query_counters __user *user_arg) +{ + struct bch_ioctl_query_counters arg; + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); + if (ret) + return ret; + + if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || + arg.pad) + return -EINVAL; + + arg.nr = min(arg.nr, BCH_COUNTER_NR); + ret = put_user(arg.nr, &user_arg->nr); + if (ret) + return ret; + + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + + if (stable < arg.nr) { + u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) + ? 
percpu_u64_get(&c->counters[i]) + : c->counters_on_mount[i]; + + ret = put_user(v, &user_arg->d[stable]); + if (ret) + return ret; + } + } + + return 0; +} +#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h index 81f8aec9fcb1..a4329ad8dd1b 100644 --- a/fs/bcachefs/sb-counters.h +++ b/fs/bcachefs/sb-counters.h @@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); void bch2_fs_counters_exit(struct bch_fs *); int bch2_fs_counters_init(struct bch_fs *); +extern const char * const bch2_counter_names[]; extern const struct bch_sb_field_ops bch_sb_field_ops_counters; +long bch2_ioctl_query_counters(struct bch_fs *, + struct bch_ioctl_query_counters __user *); + #endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fdcf598f08b1..fa27ec59a647 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -9,10 +9,24 @@ enum counters_flags { #define BCH_PERSISTENT_COUNTERS() \ x(io_read, 0, TYPE_SECTORS) \ + x(io_read_inline, 80, TYPE_SECTORS) \ + x(io_read_hole, 81, TYPE_SECTORS) \ + x(io_read_promote, 30, TYPE_COUNTER) \ + x(io_read_bounce, 31, TYPE_COUNTER) \ + x(io_read_split, 33, TYPE_COUNTER) \ + x(io_read_reuse_race, 34, TYPE_COUNTER) \ + x(io_read_retry, 32, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ + x(io_move_read, 35, TYPE_SECTORS) \ + x(io_move_write, 36, TYPE_SECTORS) \ + x(io_move_finish, 37, TYPE_SECTORS) \ + x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ + x(io_move_start_fail, 39, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ x(bucket_alloc, 5, TYPE_COUNTER) \ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ x(btree_cache_scan, 7, TYPE_COUNTER) \ @@ -38,16 +52,6 @@ enum counters_flags { x(journal_reclaim_finish, 27, TYPE_COUNTER) \ x(journal_reclaim_start, 28, TYPE_COUNTER) \ 
x(journal_write, 29, TYPE_COUNTER) \ - x(read_promote, 30, TYPE_COUNTER) \ - x(read_bounce, 31, TYPE_COUNTER) \ - x(read_split, 33, TYPE_COUNTER) \ - x(read_retry, 32, TYPE_COUNTER) \ - x(read_reuse_race, 34, TYPE_COUNTER) \ - x(move_extent_read, 35, TYPE_SECTORS) \ - x(move_extent_write, 36, TYPE_SECTORS) \ - x(move_extent_finish, 37, TYPE_SECTORS) \ - x(move_extent_fail, 38, TYPE_COUNTER) \ - x(move_extent_start_fail, 39, TYPE_COUNTER) \ x(copygc, 40, TYPE_COUNTER) \ x(copygc_wait, 41, TYPE_COUNTER) \ x(gc_gens_end, 42, TYPE_COUNTER) \ @@ -95,6 +99,13 @@ enum bch_persistent_counters { BCH_COUNTER_NR }; +enum bch_persistent_counters_stable { +#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_STABLE_NR +}; + struct bch_sb_field_counters { struct bch_sb_field field; __le64 d[]; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 051214fdc735..acb5d845841e 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,7 +90,13 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(cached_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(stripe_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index b86ec013d7d7..67455beb8358 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -179,6 +179,7 @@ enum bch_fsck_flags { x(ptr_crc_redundant, 160, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ + x(extent_flags_not_at_start, 306, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ 
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ @@ -314,7 +315,9 @@ enum bch_fsck_flags { x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(MAX, 304, 0) + x(dirent_cf_name_too_big, 304, 0) \ + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(MAX, 307, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 762083b564ee..38261638a611 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -23,7 +23,19 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) return !percpu_ref_is_zero(&ca->io_ref); } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + +static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + bool ret = ca && bch2_dev_is_online(ca); + rcu_read_unlock(); + + return ret; +} + +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; @@ -271,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); if (ca && !percpu_ref_tryget(&ca->io_ref)) diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 2adf1221a440..3affec823b3f 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -79,6 +79,7 @@ struct bch_member { #define BCH_MEMBER_V1_BYTES 56 +LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16) LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, 
HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c54091a28909..e7f197896db1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) goto out; } - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); + if (likely(ancestor >= IS_ANCESTOR_BITMAP)) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) @@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) { u32 id = snapshot_root; u32 subvol = 0, s; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 00373cf32e7b..81180181d7c9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) return id; } +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index d78451c2a0c6..93e71119e5a4 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, for (unsigned i = 0; i < 1000; i++) { unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); if (u64s > U8_MAX) return -EINVAL; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 
55a4ac7bf220..575ad1e03904 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -12,7 +12,6 @@ #include "super.h" #include <linux/crc32c.h> -#include <crypto/hash.h> #include <crypto/sha2.h> static inline enum bch_str_hash_type @@ -34,6 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u8 type; + struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of * the siphash_key (k0) is used as the key. @@ -47,17 +47,17 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) /* XXX ick */ struct bch_hash_info info = { .type = INODE_STR_HASH(bi), +#ifdef CONFIG_UNICODE + .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL, +#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; - desc->tfm = c->sha256; - - crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); + sha256((const u8 *)&bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a81a7b6c0989..572b06bfa0b8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -25,9 +25,6 @@ #include <linux/sort.h> #include <linux/string_choices.h> -static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -}; - struct bch2_metadata_version { u16 version; const char *name; @@ -69,12 +66,14 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { - bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed; 
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed) + ? 0 + : -BCH_ERR_may_not_use_incompat_feature; - if (ret) { + if (!ret) { mutex_lock(&c->sb_lock); SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); @@ -366,39 +365,41 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; - u16 block_size; int ret; ret = bch2_sb_compatible(sb, out); if (ret) return ret; - if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - prt_printf(out, "Filesystem has incompatible features"); + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); return -BCH_ERR_invalid_sb_features; } if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_printf(out, "Filesystem has incompatible version"); + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); return -BCH_ERR_invalid_sb_features; } - block_size = le16_to_cpu(sb->block_size); - - if 
(block_size > PAGE_SECTORS) { - prt_printf(out, "Block size too big (got %u, max %u)", - block_size, PAGE_SECTORS); - return -BCH_ERR_invalid_sb_block_size; - } - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; @@ -409,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_uuid; } + if (!(flags & BCH_VALIDATE_write) && + le64_to_cpu(sb->offset) != read_offset) { + prt_printf(out, "Bad sb offset (got %llu, read from %llu)", + le64_to_cpu(sb->offset), read_offset); + return -BCH_ERR_invalid_sb_offset; + } + if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", @@ -464,6 +472,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ @@ -474,8 +489,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, opt_id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, opt_id, -1); prt_printf(out, "Invalid option "); ret = bch2_opt_validate(opt, v, out); @@ -755,7 +770,7 @@ retry: memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; - sb->holder = kmalloc(1, GFP_KERNEL); + sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); if (!sb->holder) return -ENOMEM; @@ -881,7 +896,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, 0, &err); + ret = bch2_sb_validate(sb->sb, offset, 0, 
&err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -918,16 +933,16 @@ static void write_super_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; + bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); + /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "superblock %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); ca->sb_write_error = 1; + } closure_put(&ca->fs->sb_write); percpu_ref_put(&ca->io_ref); @@ -1038,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1166,7 +1181,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -1; + ret = -BCH_ERR_erofs_sb_err; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); @@ -1223,12 +1238,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) bch2_sb_field_resize(&c->disk_sb, downgrade, 0); c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); if (incompat) { + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); } } @@ -1459,8 +1473,8 @@ void 
bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, id, -1); prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b4cff9ebdebb..78f708a6fbcd 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -static inline bool bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) +static inline int bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) { return likely(version <= c->sb.version_incompat) - ? true + ? 
0 : bch2_set_version_incompat(c, version); } @@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0459c875e189..99f9a0aaa380 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -75,9 +75,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: crc64"); -MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); @@ -718,7 +715,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir); + bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; @@ -837,6 +834,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; +#ifdef CONFIG_UNICODE + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } +#else + if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { + printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); + ret = -EINVAL; + goto err; + } +#endif + pr_uuid(&name, c->sb.user_uuid.b); ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) @@ -1056,6 +1072,7 @@ int bch2_fs_start(struct bch_fs *c) } set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); if (c->opts.read_only) { bch2_fs_read_only(c); @@ -1280,8 +1297,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) return 0; if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, - "dev-%u", ca->dev_idx); + ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: + bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); if (ret) return ret; } @@ -1412,6 +1429,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + /* + * Stash pointer to the filesystem for blk_holder_ops - note that once + * attached to a filesystem, we will always close the block device + * before tearing down the filesystem object. 
+ */ + ca->disk_sb.holder->c = ca->fs; + ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); @@ -1966,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; @@ -1998,6 +2019,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } +/* blk_holder_ops: */ + +static struct bch_fs *bdev_get_fs(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct bch_sb_handle_holder *holder = bdev->bd_holder; + struct bch_fs *c = holder->c; + + if (c && !bch2_ro_ref_tryget(c)) + c = NULL; + + mutex_unlock(&bdev->bd_holder_lock); + + if (c) + wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); + return c; +} + +/* returns with ref on ca->ref */ +static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) +{ + for_each_member_device(c, ca) + if (ca->disk_sb.bdev == bdev) + return ca; + return NULL; +} + +static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + } + + down_write(&c->state_lock); + struct bch_dev *ca = 
bdev_to_bch_dev(c, bdev); + if (!ca) + goto unlock; + + if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + __bch2_dev_offline(c, ca); + } else { + if (sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only(c); + } + + bch2_dev_put(ca); +unlock: + if (sb) + up_read(&sb->s_umount); + up_write(&c->state_lock); + bch2_ro_ref_put(c); +} + +static void bch2_fs_bdev_sync(struct block_device *bdev) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + bch2_ro_ref_put(c); +} + +const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + .mark_dead = bch2_fs_bdev_mark_dead, + .sync = bch2_fs_bdev_sync, +}; + /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 04f8287eff5c..23533bce5709 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; + #endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 368a63d938cf..3a899f799d1d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -2,13 +2,19 @@ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H +struct bch_fs; + +struct bch_sb_handle_holder { + struct bch_fs *c; +}; + struct bch_sb_handle { struct bch_sb *sb; struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; - void 
*holder; + struct bch_sb_handle_holder *holder; size_t buffer_size; blk_mode_t mode; unsigned have_layout:1; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a7eb1f511484..251ba8224c1f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,15 +146,14 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_btree_updates); read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(flags); -read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -rw_attribute(durability); read_attribute(io_done); read_attribute(io_errors); write_attribute(io_errors_reset); @@ -173,10 +172,8 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); -read_attribute(write_points); read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG @@ -209,8 +206,6 @@ read_attribute(usage_base); BCH_PERSISTENT_COUNTERS() #undef x -rw_attribute(discard); -read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); @@ -355,18 +350,12 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); - if (attr == &sysfs_write_points) - bch2_write_points_to_text(out, c); - if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -415,6 +404,9 @@ STORE(bch2_fs) /* Debugging: */ + if (attr == &sysfs_trigger_btree_updates) + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + if 
(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; @@ -566,10 +558,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, - &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, #endif @@ -585,6 +575,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_btree_updates, &sysfs_gc_gens_pos, @@ -604,26 +595,34 @@ struct attribute *bch2_fs_internal_files[] = { /* options */ -SHOW(bch2_fs_opts_dir) +static ssize_t sysfs_opt_show(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + struct printbuf *out) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int id = opt - bch2_opt_table; - u64 v = bch2_opt_get_by_id(&c->opts, id); + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->flags & OPT_FS) { + v = bch2_opt_get_by_id(&c->opts, id); + } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { + v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); + } else { + return -EINVAL; + } bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); prt_char(out, '\n'); - return 0; } -STORE(bch2_fs_opts_dir) +static ssize_t sysfs_opt_store(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + const char *buf, size_t size) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret, id = opt - bch2_opt_table; - char *tmp; - u64 v; + const struct bch_option *opt = bch2_opt_table + id; + int ret = 0; /* * We don't need to take c->writes for correctness, but it eliminates an @@ -632,27 +631,28 @@ STORE(bch2_fs_opts_dir) if (unlikely(!bch2_write_ref_tryget(c, 
BCH_WRITE_REF_sysfs))) return -EROFS; - tmp = kstrdup(buf, GFP_KERNEL); + down_write(&c->state_lock); + + char *tmp = kstrdup(buf, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto err; } - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + u64 v; + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: + bch2_opt_check_may_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - ret = bch2_opt_check_may_set(c, id, v); - if (ret < 0) - goto err; - - bch2_opt_set_sb(c, NULL, opt, v); + bch2_opt_set_sb(c, ca, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && (id == Opt_background_target || + (id == Opt_foreground_target && !c->opts.background_target) || id == Opt_background_compression || (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); @@ -664,27 +664,56 @@ STORE(bch2_fs_opts_dir) c->copygc_thread) wake_up_process(c->copygc_thread); + if (id == Opt_discard && !ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) + opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = size; err: + up_write(&c->state_lock); bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_show(c, NULL, id, out); +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_store(c, NULL, id, buf, size); +} SYSFS_OPS(bch2_fs_opts_dir); struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -int bch2_opts_create_sysfs_files(struct kobject *kobj) +int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) { - const struct bch_option *i; - int ret; - - for (i = bch2_opt_table; + for (const struct bch_option *i = bch2_opt_table; i < 
bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->flags & OPT_FS)) + if (i->flags & OPT_HIDDEN) + continue; + if (!(i->flags & type)) continue; - ret = sysfs_create_file(kobj, &i->attr); + int ret = sysfs_create_file(kobj, &i->attr); if (ret) return ret; } @@ -755,11 +784,8 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); - sysfs_print(bucket_size, bucket_bytes(ca)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); - sysfs_print(durability, ca->mi.durability); - sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { if (ca->mi.group) @@ -772,11 +798,6 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state) { - prt_string_option(out, bch2_member_states, ca->mi.state); - prt_char(out, '\n'); - } - if (attr == &sysfs_io_done) dev_io_done_to_text(out, ca); @@ -802,6 +823,10 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_show(c, ca, opt_id, out); + return 0; } @@ -810,18 +835,6 @@ STORE(bch2_dev) struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); - } - - if (attr == &sysfs_durability) { - u64 v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); - } - if (attr == &sysfs_label) { char *tmp; int ret; @@ -839,20 +852,20 @@ STORE(bch2_dev) if (attr == &sysfs_io_errors_reset) bch2_dev_errors_reset(ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_store(c, ca, opt_id, buf, size); + return size; } SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, - &sysfs_bucket_size, &sysfs_first_bucket, &sysfs_nbuckets, - &sysfs_durability, /* settings: */ - &sysfs_discard, - &sysfs_state, &sysfs_label, &sysfs_has_data, diff --git 
a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 222cd5062702..303e0433c702 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; extern const struct sysfs_ops bch2_dev_sysfs_ops; -int bch2_opts_create_sysfs_files(struct kobject *); +int bch2_opts_create_sysfs_files(struct kobject *, unsigned); #else @@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; static const struct sysfs_ops bch2_dev_sysfs_ops; -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) +{ return 0; } #endif /* NO_BCACHEFS_SYSFS */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c1b51009edf6..519d00d62ae7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -295,12 +295,12 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_promote, +DEFINE_EVENT(bio, io_read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -TRACE_EVENT(read_nopromote, +TRACE_EVENT(io_read_nopromote, TP_PROTO(struct bch_fs *c, int ret), TP_ARGS(c, ret), @@ -319,26 +319,50 @@ TRACE_EVENT(read_nopromote, __entry->ret) ); -DEFINE_EVENT(bio, read_bounce, +DEFINE_EVENT(bio, io_read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, io_read_split, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_retry, +DEFINE_EVENT(bio, io_read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_reuse_race, +DEFINE_EVENT(bio, io_read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, idx ) + __field(int, ret ) + ), + + 
TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, @@ -797,53 +821,37 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -TRACE_EVENT(bucket_evacuate, - TP_PROTO(struct bch_fs *c, struct bpos *bucket), - TP_ARGS(c, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = bucket->inode; - __entry->bucket = bucket->offset; - ), - - TP_printk("%d:%d %u:%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) +DEFINE_EVENT(fs_str, io_move, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent, +DEFINE_EVENT(fs_str, io_move_read, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_read, +DEFINE_EVENT(fs_str, io_move_write, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_write, +DEFINE_EVENT(fs_str, io_move_finish, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_finish, +DEFINE_EVENT(fs_str, io_move_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_fail, +DEFINE_EVENT(fs_str, io_move_write_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_start_fail, +DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -881,37 +889,6 @@ TRACE_EVENT(move_data, __entry->sectors_raced) ); -TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( 
- __field(dev_t, dev ) - __field(u64, member ) - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->member = bucket->inode; - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; - __entry->ret = ret; - ), - - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, - __entry->ret) -); - TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 buckets, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index da2cd11b3025..553de8d8e3e5 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + eytzinger0_for_each(j, NR_QUANTILES) { + bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - u64 q = max(quantiles->entries[i].m, last_q); + u64 q = max(quantiles->entries[j].m, last_q); prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); @@ -704,12 +704,33 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + #if 0 void eytzinger1_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("1 based eytzinger test:"); + 
pr_info("1 based eytzinger test:\n"); for (size = 2; size < 65536; @@ -717,13 +738,7 @@ void eytzinger1_test(void) unsigned extra = eytzinger1_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); - BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); - - BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); - BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + pr_info("tree size %u\n", size); inorder = 1; eytzinger1_for_each(eytz, size) { @@ -734,15 +749,16 @@ void eytzinger1_test(void) inorder++; } + BUG_ON(inorder - 1 != size); } } void eytzinger0_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("0 based eytzinger test:"); + pr_info("0 based eytzinger test:\n"); for (size = 1; size < 65536; @@ -750,13 +766,7 @@ void eytzinger0_test(void) unsigned extra = eytzinger0_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); - BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); - - BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); - BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + pr_info("tree size %u\n", size); inorder = 0; eytzinger0_for_each(eytz, size) { @@ -767,54 +777,191 @@ void eytzinger0_test(void) inorder++; } + BUG_ON(inorder != size); + + inorder = size - 1; + eytzinger0_for_each_prev(eytz, size) { + BUG_ON(eytz != eytzinger0_first(size) && + eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); + + inorder--; + } + BUG_ON(inorder != -1); } } -static inline int cmp_u16(const void *_l, const void *_r, size_t size) +static inline int cmp_u16(const void *_l, const void *_r) { const u16 *l = _l, *r = _r; - return (*l > *r) - (*r - *l); + return (*l > *r) - (*r > *l); } -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 
search) { - int i, c1 = -1, c2 = -1; - ssize_t r; + int r, s; + bool bad; r = eytzinger0_find_le(test_array, nr, sizeof(test_array[0]), cmp_u16, &search); - if (r >= 0) - c1 = test_array[r]; - - for (i = 0; i < nr; i++) - if (test_array[i] <= search && test_array[i] > c2) - c2 = test_array[i]; - - if (c1 != c2) { - eytzinger0_for_each(i, nr) - pr_info("[%3u] = %12u", i, test_array[i]); - pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", - i, r, c1, c2); + if (r >= 0) { + if (test_array[r] > search) { + bad = true; + } else { + s = eytzinger0_next(r, nr); + bad = s >= 0 && test_array[s] <= search; + } + } else { + s = eytzinger0_last(nr); + bad = s >= 0 && test_array[s] <= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each_prev(j, nr) { + if (test_array[j] <= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_le(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); } } +static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_gt(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] <= search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] > search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] > search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] > search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_gt(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_ge(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] < search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && 
test_array[s] >= search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] >= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] >= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_ge(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) +{ + unsigned r; + int s; + bool bad; + + r = eytzinger0_find(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + + if (r < nr) { + bad = test_array[r] != search; + } else { + s = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + bad = s >= 0 && test_array[s] == search; + } + + if (bad) { + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find(%12u) = %3i is incorrect\n", + search, r); + BUG(); + } +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + eytzinger0_find_test_le(test_array, nr, search); + eytzinger0_find_test_gt(test_array, nr, search); + eytzinger0_find_test_ge(test_array, nr, search); + eytzinger0_find_test_eq(test_array, nr, search); +} + void eytzinger0_find_test(void) { unsigned i, nr, allocated = 1 << 12; u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); for (nr = 1; nr < allocated; nr++) { - pr_info("testing %u elems", nr); + u16 prev = 0; + + pr_info("testing %u elems\n", nr); get_random_bytes(test_array, nr * sizeof(test_array[0])); eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); /* verify array is sorted correctly: */ - eytzinger0_for_each(i, nr) - BUG_ON(i != eytzinger0_last(nr) && - test_array[i] > test_array[eytzinger0_next(i, nr)]); + eytzinger0_for_each(j, nr) { + BUG_ON(test_array[j] < prev); + prev = test_array[j]; + } for (i = 0; i < U16_MAX; i += 1 << 12) eytzinger0_find_test_val(test_array, nr, i); diff 
--git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index f4a4783219d9..7d921fc920a0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -406,6 +406,18 @@ u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) do {} while (0) +#endif + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { @@ -419,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -496,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index aed7c6984173..f9667b944c0d 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -523,7 +523,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, opt_id, v); + ret = bch2_opt_check_may_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf..584fa89bc877 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz, } #define NOTE_DATA_SZ SZ_1K -#define GNU_PROPERTY_TYPE_0_NAME "GNU" -#define NOTE_NAME_SZ 
(sizeof(GNU_PROPERTY_TYPE_0_NAME)) +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) @@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, @@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, "CORE", NT_FILE, size, data); + fill_note(note, NN_FILE, NT_FILE, size, data); return 0; } @@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); @@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? 
"CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); @@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); @@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c13ee8180b17..9133f3827f90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* deal with each load segment separately */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { - unsigned long maddr, disp, excess, excess1; + unsigned long maddr, disp, excess; int prot = 0, flags; if (phdr->p_type != PT_LOAD) @@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * extant in the file */ excess = phdr->p_memsz - phdr->p_filesz; - excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU + unsigned long excess1 + = 
PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; @@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); @@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..15ea6348800b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- 
a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a4c51600a408..f3bffe08b290 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -296,7 +295,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +306,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. 
*/ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3d3923cfc357..5936cff80ff3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc2555c44a12..8c2eee1f1878 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. 
*/ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } @@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. 
*/ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0a8f7d92acc..a8129f1ce78c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new, /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct rb_node *exist; int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); exist = rb_find_add_cached(&block_group->cache_node, - &info->block_group_cache_tree, btrfs_bg_start_cmp); + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); if (exist) ret = -EEXIST; - write_unlock(&info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); return ret; } @@ -584,7 +584,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -626,7 +626,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -738,8 +737,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 
0, 0); @@ -785,8 +784,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -1457,6 +1456,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper respect + * to locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. */ @@ -1571,8 +1596,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * drop under the "next" label for the * fs_info->unused_bgs list. */ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1823,7 +1847,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1887,6 +1912,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. 
As soon as the block group is read only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1903,31 +1939,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters.
+ */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", + "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", bg->start, - div64_u64(bg->used * 100, bg->length), + div64_u64(used * 100, bg->length), + div64_u64(reserved * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1936,24 +1988,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. 
- */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1993,13 +2034,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2410,7 +2446,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; @@ -2459,7 +2495,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. 
@@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); -out: - btrfs_free_path(path); + return ret; } @@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. 
*/ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2888,7 +2926,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -3306,7 +3344,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3323,7 +3361,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3346,7 +3383,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3501,7 +3538,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3512,7 +3548,7 @@ int 
btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3624,7 +3660,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..4e2952cf5766 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,6 +517,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -524,7 +533,7 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -584,9 +593,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold 
btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..e7f8ee5d48a4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = min(max(level, ops->min_level), ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. 
+ */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1590,18 +1598,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * level, unrecognized string will set the default level. Negative level + * numbers are allowed. 
*/ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..df198623cc08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -72,16 +72,6 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. */ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) { @@ -93,7 +83,8 @@ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +98,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -131,14 +122,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head 
*btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +179,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3dc5a35dd19b..a2e7979372cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4306,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4320,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4608,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4644,7 +4642,8 @@ again: goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. 
*/ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4668,8 +4667,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4679,11 +4678,10 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; + /* Save our key for returning back. */ + btrfs_node_key_to_cpu(cur, min_key, slot); goto out; } cur = btrfs_read_node_slot(cur, slot); @@ -4700,10 +4698,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { + if (ret == 0) btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..075a06db43a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,7 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..d4310d93f532 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -225,7 +225,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +250,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + 
clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -1352,17 +1352,18 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; @@ -1376,10 +1377,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= 
BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } } if (extent_thresh == 0) @@ -1402,8 +1414,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; @@ -1420,27 +1432,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, &sectors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1462,10 +1476,10 @@ int btrfs_defrag_file(struct inode *inode, struct
file_ra_state *ra, * need to be written back immediately. */ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1474,9 +1488,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b4933c6a889..3f1551d8a5c6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1211,7 +1211,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ 
-1238,7 +1238,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1817,53 +1816,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); 
btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1876,39 +1875,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = 
btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec 
= btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1928,8 +1926,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1937,7 +1934,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..c4b4ba122beb 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a35067cebb97..f5ae880308d3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include 
<linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f86fbea0b3de..53d7d85cb4be 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = 
-ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ccf91de29f80..b29cc31a7c4a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. 
*/ @@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. */ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..8462579a95f4 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,6 +10,7 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 8567af46e16f..a374ce7a1813 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,7 +248,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -855,6 +856,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in, applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatch, if the + * inode requires data checksum, just fallback to buffered IO. + * For buffered IO we have full control of page cache and can ensure + * no one is modifying the content during writeback. 
+ */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..d6eef4bd9e9d 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. 
- */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -260,9 +254,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. 
- */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..1a916716cefe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,13 +182,12 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, @@ -284,8 +283,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -1089,21 +1087,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. 
* * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1127,7 +1126,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1136,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1144,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -2200,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2210,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2447,21 +2442,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check 
will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 matches (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2561,6 +2562,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + ret = validate_sys_chunk_array(fs_info, sb); /* @@ -3390,7 +3394,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; @@ -3416,11 +3419,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < 
PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -4326,6 +4324,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4346,6 +4352,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. 
We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4369,6 +4400,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); @@ -4403,9 +4436,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4528,10 +4558,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4902,7 +4928,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4918,14 +4944,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with 
offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4936,10 +4961,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry 
*btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = 
btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..13de6af279e5 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -346,10 +346,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { btrfs_panic(extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3014a1a23efd..957230abd827 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = 
btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 +167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; 
@@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1631,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1679,8 +1674,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1688,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1701,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } @@ -2348,8 +2342,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, 
extent_root, &key, path, 0, 0); if (ret < 0) @@ -2874,7 +2868,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) block_group->length, &trimmed); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -5465,7 +5467,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5482,7 +5484,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5517,7 +5518,6 @@ again: mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 
1 : 0; } @@ -6285,7 +6285,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6298,10 +6298,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6338,7 +6336,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index cfa52264f678..0ed682d9ed7b 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..197f5e51c474 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -425,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -488,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if 
(!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -753,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, { struct btrfs_inode *inode = folio_to_inode(folio); - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && @@ -836,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -870,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -887,8 +887,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -935,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t 
pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -955,25 +951,29 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); @@ -985,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + if (em->flags & EXTENT_FLAG_PREALLOC) 
block_start = EXTENT_MAP_HOLE; + else + block_start = extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1042,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1064,15 +1059,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start at. + * Output as where the next check should start at, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. 
+ */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Have we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. 
if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @next_ret to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. 
*/ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. 
*/ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1083,7 +1253,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -1105,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1118,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1157,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = 
btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1184,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1264,7 +1435,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1309,7 +1480,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1324,7 +1495,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. 
*/ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1333,7 +1504,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. */ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1434,6 +1605,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1442,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1530,6 +1704,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1551,6 +1726,30 @@ static int 
extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1562,14 +1761,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1725,20 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock( static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + 
bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); @@ -1792,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } @@ -1849,9 +2024,10 @@ static int 
submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { + while (bit_start < blocks_per_folio) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1867,7 +2043,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&folio->mapping->i_private_lock); @@ -1933,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct if (!folio_test_private(folio)) return 0; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); @@ -2192,10 +2368,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2233,7 +2407,7 @@ retry: * regular submission. 
*/ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2314,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2325,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. */ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2390,7 +2569,7 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); @@ -2443,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; bool ret; if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { @@ 
-2481,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2592,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -2618,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2629,7 +2808,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. 
*/ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); spin_unlock(&folio->mapping->i_private_lock); } @@ -2662,15 +2841,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2679,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } @@ -2687,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2707,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(src); i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); @@ -2723,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return new; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer 
*alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto out; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto out_detach; } set_extent_buffer_uptodate(eb); @@ -2750,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { + +out_detach: + for (int i = 0; i < num_extent_folios(eb); i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); folio_put(eb->folios[i]); } } +out: kmem_cache_free(extent_buffer_cache, eb); return NULL; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); -} - static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -2805,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_mark_accessed(eb->folios[i]); } @@ -2842,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2881,8 +3050,11 @@ again: free_eb: 
btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2896,7 +3068,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2999,7 +3171,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3041,8 +3213,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3070,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3090,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. 
*/ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3106,9 +3276,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3148,7 +3317,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3159,7 +3328,7 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* @@ -3202,7 +3371,7 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_unlock(eb->folios[i]); return eb; @@ -3233,7 +3402,7 @@ out: } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. + * so it can be cleaned up without utilizing folio->mapping. 
*/ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3333,11 +3502,10 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) __xa_clear_mark(&folio->mapping->i_pages, @@ -3345,26 +3513,10 @@ static void btree_clear_folio_dirty(struct folio *folio) xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. */ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3391,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } 
WARN_ON(atomic_read(&eb->refs) == 0); @@ -3409,37 +3560,34 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. 
*/ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3447,54 +3595,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. 
- */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3507,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3532,19 +3654,6 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } - clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3555,7 +3664,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3595,19 +3703,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < 
num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3796,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4282,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..2e261892c7bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -252,8 +252,6 @@ void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -276,7 +274,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without 
caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -294,8 +293,10 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. */ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { if (folio_order(eb->folios[0])) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d04a3b47b1fb..344b4db487a0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); -out: - btrfs_free_path(path); + return ret; } @@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = 
bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -341,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; @@ -373,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); + if (!bbio->csum) return BLK_STS_RESOURCE; - } } else { bbio->csum = bbio->csum_inline; } @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - 
key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1074,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..6181a70ec3ef 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0b568c8d24cb..262a707d8990 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -874,7 +875,6 @@ again: ret = PTR_ERR(folio); return ret; } - folio_wait_writeback(folio); /* Only support page sized folio yet. 
*/ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); @@ -1014,8 +1014,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1783,6 +1782,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; + size_t fsize = folio_size(folio); vm_fault_t ret; int ret2; int reserved = 0; @@ -1793,7 +1793,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ASSERT(folio_order(folio) == 0); - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1847,7 +1847,7 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. 
*/ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); @@ -1859,11 +1859,11 @@ again: if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + fsize - reserved_space, true); } } @@ -1890,12 +1890,12 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); @@ -1904,7 +1904,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -1913,7 +1913,7 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- 
a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d42b6f882f57..05e173311c1a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 
@@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } - ret = set_folio_extent_mapped(page_folio(page)); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping 
!= inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1156,8 +1155,8 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..39c6b96a4c25 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1062,7 +1062,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + 
return ret; } @@ -1217,7 +1215,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1233,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1242,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1638,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1657,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1670,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. 
*/ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" #include "volumes.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..bcca43046064 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. + * + * Normally it's 4K, but for testing subpage block size on 4K page systems, we + * allow DEBUG builds to accept 2K page size. + */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_MIN_BLOCKSIZE (SZ_2K) +#else +#define BTRFS_MIN_BLOCKSIZE (SZ_4K) +#endif + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -105,6 +117,9 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + /* No more delayed iput can be queued. */ + BTRFS_FS_STATE_NO_DELAYED_IPUT, + BTRFS_FS_STATE_COUNT }; @@ -485,8 +500,8 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; - unsigned long compress_type:4; - unsigned int compress_level; + int compress_type; + int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -709,7 +724,6 @@ struct btrfs_fs_info { * running. 
*/ refcount_t scrub_workers_refcnt; - u32 sectors_per_page; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; @@ -981,6 +995,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } +static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, + const struct folio *folio) +{ + return folio_size(folio) >> fs_info->sectorsize_bits; +} + bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 448aa1a682d6..3530de0618c8 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -191,8 +191,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -317,8 +317,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -493,8 +493,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; key.objectid = control->ino; - key.offset = (u64)-1; key.type = (u8)-1; + key.offset = (u64)-1; search_again: /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38756f8cef46..cc67d1a2d611 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = 
btrfs_insert_empty_item(trans, root, path, &key, @@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -672,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. 
*/ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -832,7 +823,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct folio *folio; @@ -840,13 +831,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -886,6 +877,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -894,7 +886,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. 
@@ -968,13 +960,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1090,7 +1084,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -1272,10 +1265,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1492,11 +1482,9 @@ out_unlock: /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). + * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV - * are also handled by the cleanup function. + * are also handled by the ordered extents cleanup. 
* * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted. @@ -1507,6 +1495,8 @@ out_unlock: if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_folio, NULL, clear_bits, page_ops); } @@ -1976,6 +1966,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode, mapping_set_error(mapping, error); } +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. 
+ */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleaered by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -2020,15 +2069,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2154,75 +2200,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = 
btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2231,11 +2223,11 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; if (ret) { cow_end = end; goto error; } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2249,27 +2241,44 @@ error: * start cur_offset end * |/////////////| | * + * In this case, cow_start should be (u64)-1. + * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. - * Only need to clear the dirty flag as they will never be submitted. - * Ordered extent and extent maps are handled by - * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). 
+ * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end) * - * 2) Failed with error from fallback_to_cow() - * start cur_offset cow_end end + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. + * + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end * |/////////////|-----------| | * - * For range [start, cur_offset) it's the same as case 1). - * But for range [cur_offset, cow_end), the folios have dirty flag - * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * In this case, both @cow_start and @cow_end is set. * - * Thus we should not call extent_clear_unlock_delalloc() on range - * [cur_offset, cow_end), as the folios are already unlocked. + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), we should not touch anything in that range. * - * So clear the folio dirty flags for [start, cur_offset) first. + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). 
*/ - if (cur_offset > start) + if (cow_start != (u64)-1) + cur_offset = cow_start; + + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } /* * If an error happened while a COW region is outstanding, cur_offset @@ -2334,7 +2343,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2348,10 +2357,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2878,6 +2883,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental build, we error out instead of EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2896,7 +2916,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock. 
*/ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2952,8 +2972,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2988,8 +3008,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -3407,6 +3427,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3527,7 +3548,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3546,6 +3566,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3669,10 +3691,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. 
*/ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3695,7 +3717,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3845,12 +3867,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3859,7 +3882,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3869,7 +3892,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3889,41 +3912,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, 
btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = 
btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3935,9 +3958,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3966,7 +3988,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3974,15 +3996,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. 
*/ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3990,13 +4012,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -4004,50 +4025,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = 
&btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -5602,7 +5622,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5614,40 +5634,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. 
*/ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. */ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5655,7 +5677,7 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5667,43 +5689,46 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + 
inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5717,15 +5742,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root 
*sub_root = root; struct btrfs_key location = { 0 }; @@ -5742,18 +5767,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5768,19 +5793,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -6253,7 +6281,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6339,6 +6367,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6432,7 +6461,7 @@ 
int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, @@ -6442,11 +6471,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6544,7 +6575,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6744,18 +6775,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6764,6 +6795,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6780,7 +6812,7 @@ static noinline int 
uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6792,14 +6824,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6814,14 +6847,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -7062,17 +7095,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. 
*/ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7082,8 +7115,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) goto out; @@ -7098,7 +7131,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; @@ -7119,7 +7152,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.end = offset + *len - 1; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. 
*/ path = NULL; @@ -7135,7 +7168,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.file_extent.offset)) goto out; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; @@ -7240,7 +7273,7 @@ static void wait_subpage_spinlock(struct folio *folio) struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7264,7 +7297,7 @@ static void wait_subpage_spinlock(struct folio *folio) static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8499,8 +8532,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8511,30 +8542,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = 
igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8542,8 +8573,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8660,7 +8691,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. 
+ */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8699,8 +8735,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -9146,7 +9182,7 @@ out: } struct btrfs_encoded_read_private { - struct completion done; + struct completion *sync_reads; void *uring_ctx; refcount_t pending_refs; blk_status_t status; @@ -9158,11 +9194,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). 
*/ WRITE_ONCE(priv->status, bbio->bio.bi_status); @@ -9174,7 +9209,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - complete(&priv->done); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9185,16 +9220,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_completion(&priv->done); refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9237,11 +9282,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { if (!refcount_dec_and_test(&priv->pending_refs)) - wait_for_completion_io(&priv->done); + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c18bad53cd3..a13d81bb56a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode. 
*/ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int 
old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags 
&= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. 
*/ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. */ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = 
btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. 
*/ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1336,15 +1335,15 @@ free_args: 
return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1447,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline int key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1473,7 +1472,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1604,13 +1603,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + 
/* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). 
*/ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = 
inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); - iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2571,7 +2570,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2763,7 +2762,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2817,7 +2816,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -4248,7 +4247,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4384,7 +4383,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4415,7 +4414,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -5242,7 +5241,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return 
btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5264,9 +5263,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5314,10 +5313,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ce915fcda43b..e08ea446cf48 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -9,6 +9,8 @@ struct file; struct dentry; struct mnt_idmap; struct fileattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; @@ -18,7 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void 
btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..81e62b652e21 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4aca7475fd82..03c945711003 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. 
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 
0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) 
btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode 
*inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
*/ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9d3766c809b..d6fa36674270 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); diff --git 
a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..15c296cb4dac 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -165,7 +165,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. 
*/ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +175,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +185,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +205,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +214,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +236,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. 
*/ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to @@ -270,7 +270,7 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -281,9 +281,9 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); out: if (!ret && !trans) { /* @@ -318,7 +318,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +526,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +617,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void 
btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +646,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +679,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +689,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = 
btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -775,24 +777,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +813,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +834,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. 
*/ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +863,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +872,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +884,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + 
unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af0969b70b53..f948f4f6431c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3239,21 +3239,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3313,8 +3315,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3761,10 +3762,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3792,18 +3793,19 @@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); 
btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); + if (inode) + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3977,7 +3979,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -3986,7 +3988,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..2c5edcee9450 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1380,11 +1380,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2497,8 +2497,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..0c8c58c4f29b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -16,7 +16,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +177,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 
cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +490,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int 
name_len) @@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -826,7 +823,7 @@ static int send_rename(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if 
(ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +831,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -851,7 +847,7 @@ static int send_link(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +855,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -875,14 +870,13 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -898,14 +892,13 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1897,7 +1890,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1901,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? 
-ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1946,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1959,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2315,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2327,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2348,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2371,8 @@ out_cache: * Store the result of the lookup in the name cache. 
*/ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2390,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2429,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2488,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,6 +2588,47 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + 
fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2599,17 +2637,14 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,7 +2652,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2629,17 +2664,14 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,7 +2679,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2662,17 +2694,14 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if 
(ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,7 +2709,7 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2693,17 +2722,14 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", ino, uid, gid); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,7 +2738,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2729,9 +2755,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) btrfs_debug(fs_info, "send_utimes %llu", ino); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2782,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2793,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -3106,6 +3129,11 @@ static int orphanize_inode(struct send_ctx 
*sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4186,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -4172,9 +4217,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; @@ -4216,14 +4261,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4489,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. 
*/ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4547,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4553,6 +4593,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4701,7 +4743,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4752,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4766,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; 
+ int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4781,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 +4795,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4806,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4819,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4905,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4926,6 @@ static 
int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4933,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4945,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4952,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. 
Problem with that is, that receiving these zero byte @@ -4943,48 +4975,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_remove_xattr(sctx, p, name, name_len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5111,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5166,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path 
*path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5181,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 +5211,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5343,31 +5343,25 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); - if (ret < 0) - goto out; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, 
BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5380,6 +5374,7 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; btrfs_debug(sctx->send_root->fs_info, @@ -5387,6 +5382,10 @@ static int send_clone(struct send_ctx *sctx, offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -5395,13 +5394,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5452,17 +5447,13 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); @@ -5471,8 +5462,6 @@ static int send_update_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5501,12 +5490,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, 
offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5527,7 +5514,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5535,9 +5521,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5546,23 +5530,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5578,12 +5552,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, 
btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5591,9 +5565,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(fspath); - iput(inode); return ret; } @@ -5602,7 +5573,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5617,9 +5588,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5627,10 +5598,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5672,7 +5639,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5698,8 +5665,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5741,15 +5707,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5828,7 +5793,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5854,25 +5818,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); 
btrfs_free_path(path); return ret; } @@ -6898,6 +6856,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -8107,10 +8066,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8173,6 +8131,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8449,6 +8408,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a341d087567a..ff089e3e4103 100644 --- a/fs/btrfs/space-info.c +++ 
b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 722acf768396..11dbd7be6a3b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -64,35 +63,14 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) -{ - if (fs_info->sectorsize >= PAGE_SIZE) - return false; - - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. */ + ASSERT(!folio_test_large(folio)); + /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. @@ -101,10 +79,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. 
*/ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); + subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); if (IS_ERR(subpage)) return PTR_ERR(subpage); @@ -112,12 +94,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_detach_private(folio); @@ -126,15 +113,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol } struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) + size_t fsize, enum btrfs_subpage_type type) { struct btrfs_subpage *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -165,7 +153,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, 
folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -179,7 +167,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -206,16 +194,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,7 +223,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + folio_size(folio), orig_start + orig_len) - *start; } @@ -296,7 +286,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -323,13 +313,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * 
btrfs_bitmap_nr_locked; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -341,7 +332,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, } spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } @@ -352,15 +343,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -372,7 +375,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, 
spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -426,7 +429,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -467,7 +470,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } @@ -498,7 +501,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -513,7 +516,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -569,7 +572,7 @@ void btrfs_folio_set_##name(const 
struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +582,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +592,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +600,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +611,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +622,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if 
(!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,26 +660,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const int sectors_per_page = fs_info->sectors_per_page; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ + ASSERT(blocks_per_folio < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ - GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumpping bitmap start=%llu 
len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ - fs_info->sectors_per_page, &bitmap); \ + blocks_per_folio, &bitmap); \ } /* @@ -672,7 +700,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } @@ -707,7 +735,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_get_private(folio); @@ -721,15 +749,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&subpage->lock, flags); } +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. 
+ */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; +} + void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -739,28 +789,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); + ASSERT(blocks_per_folio > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, 
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &locked_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, @@ -771,10 +821,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 44fff1f4eac4..3042c5ea840a 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -69,23 +70,49 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. 
+ * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct folio *folio) +{ + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); +} #else +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return false; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return false; } #endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); + size_t fsize, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. 
+ * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize). */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc4fee519ca6..40709e2a44fc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -947,7 +947,7 @@ static int 
get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int err; @@ -982,7 +982,7 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { err = -ENOMEM; goto fail_close; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 14f53f757555..b9af74498b0c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1342,17 +1343,18 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; - ret = kstrtos64(value_str, 10, value_ret); - if (ret) + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. 
*/ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3fc5c6f90dc4..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..74aca7180a5a 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -525,7 +525,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +542,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. 
*/ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -730,7 +730,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..609bb6c9c087 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aca83a98b75a..f26a394a9ec5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. 
+ */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -1635,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1661,7 +1667,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * filesystem. */ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1690,8 +1696,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1699,16 +1705,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1716,7 +1722,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there 
is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1817,7 +1823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1855,18 +1861,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2096,7 +2102,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. 
+ */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..90dc094cfa5e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; /* * We're holding a transaction handle whether we are logging or @@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. 
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +511,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make 
sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +525,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. This is done when the tree logging code @@ -546,7 +544,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,30 +561,28 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } 
no_copy: btrfs_release_path(path); @@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) +static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, + u64 objectid) { - struct inode *inode; + struct btrfs_inode *inode; inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) - inode = NULL; + return NULL; return inode; } @@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = 
btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -1148,7 +1134,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode *victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1188,10 +1174,10 @@ again: btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); } - iput(victim_parent); + iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1325,7 +1311,7 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); dir = read_one_inode(root, parent_id); @@ -1334,10 +1320,9 @@ again: kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + 
iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. 
*/ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1611,25 +1595,25 @@ process_slot: * will free the inode. */ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); if (ret) @@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct 
inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; inode = read_one_inode(root, location->objectid); @@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, dir = read_one_inode(root, dirid); if (!dir) { - iput(inode); + iput(&inode->vfs_inode); return -EIO; } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into 
FIXUP list */ - iput(inode); - iput(dir); + iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; @@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; 
int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2184,7 +2167,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; inode = read_one_inode(root, key.objectid); @@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = -EIO; break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = 
btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5497,7 +5478,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5524,17 +5505,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5576,14 +5556,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, 
root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. 
*/ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6789,7 +6767,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; 
inode_key.offset = 0; @@ -6838,18 +6816,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6874,7 +6850,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6889,11 +6865,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7061,26 +7036,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - 
goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; /* * Skip already logged inodes or inodes corresponding to tmpfiles @@ -7089,14 +7058,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if ((btrfs_inode_in_log(inode, trans->transid) && list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + inode->vfs_inode.i_nlink == 0) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7115,8 +7082,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. 
+ */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7171,8 +7141,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7182,7 +7150,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7247,8 +7215,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f8afbd1ebb5..c8c21c55be53 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1798,8 +1798,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1918,8 +1918,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle 
*trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2721,8 +2721,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -3119,8 +3119,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3577,8 +3577,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -4184,8 +4184,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -5001,8 +5001,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -7539,8 +7539,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
*/ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 120f65e21eeb..e247d551da67 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,17 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c9e92c6941ec..545f413d81fc 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,47 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. 
*/ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + /* No large folio support yet. */ + ASSERT(!folio_test_large(folio)); + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. */ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. 
*/ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. + */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = min(bytes_left, - in_buf_folios << PAGE_SHIFT); + workspace->strm.avail_in = copy_length; } else { unsigned int pg_off; unsigned int cur_len; @@ -463,6 +489,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73e0aa9fc08a..fb8b8b29c169 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2111,6 +2111,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2272,6 +2275,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2325,6 +2331,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if 
(test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..cd5f38d6fbaa 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -26,11 +26,12 @@ #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. 
@@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. 
*/ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +358,7 @@ 
void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(¶ms, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct 
address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, + offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +722,7 @@ finish: const struct btrfs_compress_op btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..194eacbefc95 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2361,9 +2361,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) { struct inode *inode = folio->mapping->host; sector_t iblock, lblock; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + struct buffer_head *bh, *head, *prev = NULL; size_t blocksize; - int nr, i; int fully_mapped = 1; bool page_error = false; loff_t limit = i_size_read(inode); @@ -2372,16 +2371,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) limit = inode->i_sb->s_maxbytes; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; iblock = div_u64(folio_pos(folio), blocksize); lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; - nr = 0; - i = 0; do { if (buffer_uptodate(bh)) @@ -2398,7 +2393,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) page_error = true; } if (!buffer_mapped(bh)) { - folio_zero_range(folio, i * blocksize, + folio_zero_range(folio, bh_offset(bh), blocksize); if (!err) set_buffer_uptodate(bh); @@ -2411,40 +2406,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if 
(buffer_uptodate(bh)) continue; } - arr[nr++] = bh; - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - if (fully_mapped) - folio_set_mappedtodisk(folio); - - if (!nr) { - /* - * All buffers are uptodate or get_block() returned an - * error when trying to map them - we can finish the read. - */ - folio_end_read(folio, !page_error); - return 0; - } - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - bh = arr[i]; lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + mark_buffer_async_read(bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + prev = bh; + } while (iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + folio_set_mappedtodisk(folio); /* - * Stage 3: start the IO. Check for uptodateness - * inside the buffer lock in case another process reading - * the underlying blockdev brought it uptodate (the sct fix). + * All buffers are uptodate or get_block() returned an error + * when trying to map them - we must finish the read because + * end_buffer_async_read() will never be called on any buffer + * in this folio. 
*/ - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (buffer_uptodate(bh)) - end_buffer_async_read(bh, 1); - else - submit_bh(REQ_OP_READ, bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + else + folio_end_read(folio, !page_error); + return 0; } EXPORT_SYMBOL(block_read_full_folio); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7cf59713f0f7..83a60126de0f 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -128,18 +128,19 @@ retry: ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - if (ret < 0) { + subdir = ERR_PTR(cachefiles_inject_write_error()); + if (!IS_ERR(subdir)) + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); + ret = PTR_ERR(subdir); + if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); goto mkdir_error; } trace_cachefiles_mkdir(dir, subdir); - if (unlikely(d_unhashed(subdir))) { - cachefiles_put_directory(subdir); + if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { + dput(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -195,7 +196,8 @@ mark_error: mkdir_error: inode_unlock(d_inode(dir)); - dput(subdir); + if (!IS_ERR(subdir)) + dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index fe3de9ad57bf..d9bc67176128 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_free_id; } - anon_file->file = anon_inode_getfile("[cachefiles]", - &cachefiles_ondemand_fd_fops, object, O_WRONLY); + anon_file->file = anon_inode_getfile_fmode("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, + O_WRONLY, FMODE_PWRITE | FMODE_LSEEK); if (IS_ERR(anon_file->file)) { ret = 
PTR_ERR(anon_file->file); goto err_put_fd; @@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_put_file; } - anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - load = (void *)req->msg.data; load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f5224a566b69..29be367905a1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; @@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) return false; } + atomic64_inc(&mdsc->dirty_folios); + ci = ceph_inode(inode); /* dirty the head */ @@ -568,7 +571,36 @@ struct ceph_writeback_ctl u64 truncate_size; u32 truncate_seq; bool size_stable; + bool head_snapc; + struct ceph_snap_context *snapc; + struct ceph_snap_context *last_snapc; + + bool done; + bool should_loop; + bool range_whole; + pgoff_t start_index; + pgoff_t index; + pgoff_t end; + xa_mark_t tag; + + pgoff_t strip_unit_end; + unsigned int wsize; + unsigned int nr_folios; + unsigned int max_pages; + unsigned int locked_pages; + + int op_idx; + int num_ops; + u64 offset; + u64 len; + + struct folio_batch fbatch; + unsigned int processed_in_fbatch; + + bool from_pool; + struct page **pages; + struct page **data_pages; }; /* @@ -666,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode, } /* - * Write a single page, but leave the page locked. + * Write a folio, but leave it locked. * * If we get a write error, mark the mapping for error, but still adjust the - * dirty page accounting (i.e., page is no longer dirty). + * dirty page accounting (i.e., folio is no longer dirty). 
*/ -static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +static int write_folio_nounlock(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; - loff_t page_off = page_offset(page); + loff_t page_off = folio_pos(folio); int err; - loff_t len = thp_size(page); + loff_t len = folio_size(folio); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; @@ -689,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; - doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, - page->index); + doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio, + folio->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc) { - doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), - page); + doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode), + folio); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { - doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", - ceph_vinop(inode), page, snapc); + doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n", + ceph_vinop(inode), folio, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return 0; } 
ceph_put_snap_context(oldest); @@ -726,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; - doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", - ceph_vinop(inode), page, page->index, page_off, wlen, snapc, + doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n", + ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -740,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return PTR_ERR(req); } if (wlen < len) len = wlen; - set_page_writeback(page); + folio_start_writeback(folio); if (caching) - ceph_set_page_fscache(page); + ceph_set_page_fscache(&folio->page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { - bounce_page = fscrypt_encrypt_pagecache_blocks(page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > thp_size(page)); + WARN_ON_ONCE(len > folio_size(folio)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? 
&bounce_page : &page, wlen, 0, false, false); @@ -791,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", - ceph_vinop(inode), page); - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + ceph_vinop(inode), folio); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; - doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", - ceph_vinop(inode), err, page); + doutc(cl, "%llx.%llx setting mapping error %d %p\n", + ceph_vinop(inode), err, folio); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", - ceph_vinop(inode), page); + ceph_vinop(inode), folio); err = 0; /* vfs expects us to return 0 */ } - oldest = detach_page_private(page); + oldest = folio_detach_private(folio); WARN_ON_ONCE(oldest != snapc); - end_page_writeback(page); + folio_end_writeback(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -820,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) return err; } -static int ceph_writepage(struct page *page, struct writeback_control *wbc) -{ - int err; - struct inode *inode = page->mapping->host; - BUG_ON(!inode); - ihold(inode); - - if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ - - err = writepage_nounlock(page, wbc); - if (err == -ERESTARTSYS) { - /* direct memory reclaimer was killed by SIGKILL. return 0 - * to prevent caller from setting mapping/page error */ - err = 0; - } - unlock_page(page); - iput(inode); - return err; -} - /* * async writeback completion handler. 
* @@ -865,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); unsigned int len = 0; bool remove_page; @@ -920,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + + if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) { + wake_up_all(&mdsc->flush_end_wq); + WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0); + } + doutc(cl, "unlocking %p\n", page); if (remove_page) @@ -949,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_dec_osd_stopping_blocker(fsc->mdsc); } -/* - * initiate async writeback - */ -static int ceph_writepages_start(struct address_space *mapping, - struct writeback_control *wbc) +static inline +bool is_forced_umount(struct address_space *mapping) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; - struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start_index, end = -1; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct folio_batch fbatch; - int rc = 0; - unsigned int wsize = i_blocksize(inode); - struct ceph_osd_request *req = NULL; - struct ceph_writeback_ctl ceph_wbc; - bool should_loop, range_whole = false; - bool done = false; - bool caching = ceph_is_cache_enabled(inode); - xa_mark_t tag; - - if (wbc->sync_mode == WB_SYNC_NONE && - fsc->write_congested) - return 0; - - doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : - (wbc->sync_mode == WB_SYNC_ALL ? 
"ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { @@ -987,388 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping, ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); - return -EIO; /* we're in a forced umount, don't write! */ + return true; } + + return false; +} + +static inline +unsigned int ceph_define_write_size(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + unsigned int wsize = i_blocksize(inode); + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - folio_batch_init(&fbatch); + return wsize; +} + +static inline +void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_init(&ceph_wbc->fbatch); + ceph_wbc->processed_in_fbatch = 0; +} + +static inline +void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_release(&ceph_wbc->fbatch); + ceph_folio_batch_init(ceph_wbc); +} + +static inline +void ceph_init_writeback_ctl(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + ceph_wbc->snapc = NULL; + ceph_wbc->last_snapc = NULL; + + ceph_wbc->strip_unit_end = 0; + ceph_wbc->wsize = ceph_define_write_size(mapping); - start_index = wbc->range_cyclic ? mapping->writeback_index : 0; - index = start_index; + ceph_wbc->nr_folios = 0; + ceph_wbc->max_pages = 0; + ceph_wbc->locked_pages = 0; + + ceph_wbc->done = false; + ceph_wbc->should_loop = false; + ceph_wbc->range_whole = false; + + ceph_wbc->start_index = wbc->range_cyclic ? 
mapping->writeback_index : 0; + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag = PAGECACHE_TAG_TOWRITE; + ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; } else { - tag = PAGECACHE_TAG_DIRTY; + ceph_wbc->tag = PAGECACHE_TAG_DIRTY; } -retry: + + ceph_wbc->op_idx = -1; + ceph_wbc->num_ops = 0; + ceph_wbc->offset = 0; + ceph_wbc->len = 0; + ceph_wbc->from_pool = false; + + ceph_folio_batch_init(ceph_wbc); + + ceph_wbc->pages = NULL; + ceph_wbc->data_pages = NULL; +} + +static inline +int ceph_define_writeback_range(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + /* find oldest snap context with dirty data */ - snapc = get_oldest_context(inode, &ceph_wbc, NULL); - if (!snapc) { + ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL); + if (!ceph_wbc->snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); - goto out; + return -ENODATA; } - doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, - snapc->seq, snapc->num_snaps); - should_loop = false; - if (ceph_wbc.head_snapc && snapc != last_snapc) { + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", + ceph_wbc->snapc, ceph_wbc->snapc->seq, + ceph_wbc->snapc->num_snaps); + + ceph_wbc->should_loop = false; + + if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) { /* where to start/end? 
*/ if (wbc->range_cyclic) { - index = start_index; - end = -1; - if (index > 0) - should_loop = true; - doutc(cl, " cyclic, start at %lu\n", index); + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; + doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + ceph_wbc->index = wbc->range_start >> PAGE_SHIFT; + ceph_wbc->end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - doutc(cl, " not cyclic, %lu to %lu\n", index, end); + ceph_wbc->range_whole = true; + doutc(cl, " not cyclic, %lu to %lu\n", + ceph_wbc->index, ceph_wbc->end); } - } else if (!ceph_wbc.head_snapc) { + } else if (!ceph_wbc->head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. * They are not writeable until we write all dirty pages * associated with 'snapc' get written */ - if (index > 0) - should_loop = true; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(ceph_wbc->last_snapc); + ceph_wbc->last_snapc = ceph_wbc->snapc; - ceph_put_snap_context(last_snapc); - last_snapc = snapc; + return 0; +} - while (!done && index <= end) { - int num_ops = 0, op_idx; - unsigned i, nr_folios, max_pages, locked_pages = 0; - struct page **pages = NULL, **data_pages; - struct page *page; - pgoff_t strip_unit_end = 0; - u64 offset = 0, len = 0; - bool from_pool = false; +static inline +bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; +} - max_pages = wsize >> PAGE_SHIFT; +static inline +bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, + unsigned 
index) +{ + return index < ceph_wbc->nr_folios && + ceph_wbc->locked_pages < ceph_wbc->max_pages; +} -get_more_pages: - nr_folios = filemap_get_folios_tag(mapping, &index, - end, tag, &fbatch); - doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); - if (!nr_folios && !locked_pages) - break; - for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - struct folio *folio = fbatch.folios[i]; +static +int ceph_check_page_before_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_snap_context *pgsnapc; - page = &folio->page; - doutc(cl, "? %p idx %lu\n", page, page->index); - if (locked_pages == 0) - lock_page(page); /* first page */ - else if (!trylock_page(page)) - break; + /* only dirty folios, or our accounting breaks */ + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { + doutc(cl, "!dirty or !mapping %p\n", folio); + return -ENODATA; + } - /* only dirty pages, or our accounting breaks */ - if (unlikely(!PageDirty(page)) || - unlikely(page->mapping != mapping)) { - doutc(cl, "!dirty or !mapping %p\n", page); - unlock_page(page); - continue; - } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc != snapc) { - doutc(cl, "page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - if (!should_loop && - !ceph_wbc.head_snapc && - wbc->sync_mode != WB_SYNC_NONE) - should_loop = true; - unlock_page(page); - continue; + /* only if matching snap context */ + pgsnapc = page_snap_context(&folio->page); + if (pgsnapc != ceph_wbc->snapc) { + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, + ceph_wbc->snapc, ceph_wbc->snapc->seq); + + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && + wbc->sync_mode != 
WB_SYNC_NONE) + ceph_wbc->should_loop = true; + + return -ENODATA; + } + + if (folio_pos(folio) >= ceph_wbc->i_size) { + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc->i_size); + + if ((ceph_wbc->size_stable || + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, folio_size(folio)); + + return -ENODATA; + } + + if (ceph_wbc->strip_unit_end && + (folio->index > ceph_wbc->strip_unit_end)) { + doutc(cl, "end of strip unit %p\n", folio); + return -E2BIG; + } + + return 0; +} + +static inline +void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, + unsigned int max_pages) +{ + ceph_wbc->pages = kmalloc_array(max_pages, + sizeof(*ceph_wbc->pages), + GFP_NOFS); + if (!ceph_wbc->pages) { + ceph_wbc->from_pool = true; + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!ceph_wbc->pages); + } +} + +static inline +void ceph_allocate_page_array(struct address_space *mapping, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_calc_file_object_mapping(&ci->i_layout, + ceph_wbc->offset, ceph_wbc->wsize, + &objnum, &objoff, &xlen); + + ceph_wbc->num_ops = 1; + ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); + + BUG_ON(ceph_wbc->pages); + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); + + ceph_wbc->len = 0; +} + +static inline +bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, + const struct folio *folio) +{ + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; +} + +static inline +bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->num_ops >= + (ceph_wbc->from_pool ? 
CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); +} + +static inline +bool is_write_congestion_happened(struct ceph_fs_client *fsc) +{ + return atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); +} + +static inline int move_dirty_folio_in_page_array(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct page **pages = ceph_wbc->pages; + unsigned int index = ceph_wbc->locked_pages; + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; + + if (IS_ENCRYPTED(inode)) { + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, + PAGE_SIZE, + 0, + gfp_flags); + if (IS_ERR(pages[index])) { + if (PTR_ERR(pages[index]) == -EINVAL) { + pr_err_client(cl, "inode->i_blkbits=%hhu\n", + inode->i_blkbits); } - if (page_offset(page) >= ceph_wbc.i_size) { - doutc(cl, "folio at %lu beyond eof %llu\n", - folio->index, ceph_wbc.i_size); - if ((ceph_wbc.size_stable || - folio_pos(folio) >= i_size_read(inode)) && - folio_clear_dirty_for_io(folio)) - folio_invalidate(folio, 0, - folio_size(folio)); + + /* better not fail on first page! */ + BUG_ON(ceph_wbc->locked_pages == 0); + + pages[index] = NULL; + return PTR_ERR(pages[index]); + } + } else { + pages[index] = &folio->page; + } + + ceph_wbc->locked_pages++; + + return 0; +} + +static +int ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct folio *folio = NULL; + unsigned i; + int rc = 0; + + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { + folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + doutc(cl, "? 
%p idx %lu, folio_test_writeback %#x, " + "folio_test_dirty %#x, folio_test_locked %#x\n", + folio, folio->index, folio_test_writeback(folio), + folio_test_dirty(folio), + folio_test_locked(folio)); + + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + continue; + } + + if (ceph_wbc->locked_pages == 0) + folio_lock(folio); + else if (!folio_trylock(folio)) + break; + + rc = ceph_check_page_before_write(mapping, wbc, + ceph_wbc, folio); + if (rc == -ENODATA) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } else if (rc == -E2BIG) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + break; + } + + if (!folio_clear_dirty_for_io(folio)) { + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } + + /* + * We have something to write. 
If this is + * the first locked page this time through, + * calculate max possible write size and + * allocate a page array + */ + if (ceph_wbc->locked_pages == 0) { + ceph_allocate_page_array(mapping, ceph_wbc, folio); + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { + if (is_num_ops_too_big(ceph_wbc)) { + folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); - continue; - } - if (strip_unit_end && (page->index > strip_unit_end)) { - doutc(cl, "end of strip unit %p\n", page); - unlock_page(page); break; } - if (folio_test_writeback(folio) || - folio_test_private_2(folio) /* [DEPRECATED] */) { - if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", folio); - folio_unlock(folio); - continue; - } - doutc(cl, "waiting on writeback %p\n", folio); - folio_wait_writeback(folio); - folio_wait_private_2(folio); /* [DEPRECATED] */ - } - if (!clear_page_dirty_for_io(page)) { - doutc(cl, "%p !clear_page_dirty_for_io\n", page); - unlock_page(page); - continue; - } + ceph_wbc->num_ops++; + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_wbc->len = 0; + } - /* - * We have something to write. If this is - * the first locked page this time through, - * calculate max possinle write size and - * allocate a page array - */ - if (locked_pages == 0) { - u64 objnum; - u64 objoff; - u32 xlen; - - /* prepare async write request */ - offset = (u64)page_offset(page); - ceph_calc_file_object_mapping(&ci->i_layout, - offset, wsize, - &objnum, &objoff, - &xlen); - len = xlen; - - num_ops = 1; - strip_unit_end = page->index + - ((len - 1) >> PAGE_SHIFT); - - BUG_ON(pages); - max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc_array(max_pages, - sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); - } - - len = 0; - } else if (page->index != - (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (from_pool ? 
CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - - num_ops++; - offset = (u64)page_offset(page); - len = 0; - } + /* note position of first page in fbatch */ + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", + ceph_vinop(inode), folio, folio->index); - /* note position of first page in fbatch */ - doutc(cl, "%llx.%llx will write page %p idx %lu\n", - ceph_vinop(inode), page, page->index); - - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) - fsc->write_congested = true; - - if (IS_ENCRYPTED(inode)) { - pages[locked_pages] = - fscrypt_encrypt_pagecache_blocks(page, - PAGE_SIZE, 0, - locked_pages ? GFP_NOWAIT : GFP_NOFS); - if (IS_ERR(pages[locked_pages])) { - if (PTR_ERR(pages[locked_pages]) == -EINVAL) - pr_err_client(cl, - "inode->i_blkbits=%hhu\n", - inode->i_blkbits); - /* better not fail on first page! */ - BUG_ON(locked_pages == 0); - pages[locked_pages] = NULL; - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - ++locked_pages; - } else { - pages[locked_pages++] = page; - } + fsc->write_congested = is_write_congestion_happened(fsc); - fbatch.folios[i] = NULL; - len += thp_size(page); + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, + folio); + if (rc) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + break; } - /* did we get anything? 
*/ - if (!locked_pages) - goto release_folios; - if (i) { - unsigned j, n = 0; - /* shift unused page to beginning of fbatch */ - for (j = 0; j < nr_folios; j++) { - if (!fbatch.folios[j]) - continue; - if (n < j) - fbatch.folios[n] = fbatch.folios[j]; - n++; - } - fbatch.nr = n; + ceph_wbc->fbatch.folios[i] = NULL; + ceph_wbc->len += folio_size(folio); + } - if (nr_folios && i == nr_folios && - locked_pages < max_pages) { - doutc(cl, "reached end fbatch, trying for more\n"); - folio_batch_release(&fbatch); - goto get_more_pages; - } + ceph_wbc->processed_in_fbatch = i; + + return rc; +} + +static inline +void ceph_shift_unused_folios_left(struct folio_batch *fbatch) +{ + unsigned j, n = 0; + + /* shift unused page to beginning of fbatch */ + for (j = 0; j < folio_batch_count(fbatch); j++) { + if (!fbatch->folios[j]) + continue; + + if (n < j) { + fbatch->folios[n] = fbatch->folios[j]; } + n++; + } + + fbatch->nr = n; +} + +static +int ceph_submit_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_vino vino = ceph_vino(inode); + struct ceph_osd_request *req = NULL; + struct page *page = NULL; + bool caching = ceph_is_cache_enabled(inode); + u64 offset; + u64 len; + unsigned i; + new_request: - offset = ceph_fscrypt_page_offset(pages[0]); - len = wsize; + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); + len = ceph_wbc->wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, ceph_wbc->num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, false); + if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, 
CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, false); - if (IS_ERR(req)) { - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, - min(num_ops, - CEPH_OSD_SLAB_OPS), - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, true); - BUG_ON(IS_ERR(req)); + &ci->i_layout, vino, + offset, &len, 0, + min(ceph_wbc->num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, + ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, + true); + BUG_ON(IS_ERR(req)); + } + + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { + struct folio *folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + page = &folio->page; + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + - thp_size(pages[locked_pages - 1]) - offset); - if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { - rc = -EIO; - goto release_folios; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + + if (!page) + continue; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - req->r_callback = writepages_finish; - req->r_inode = inode; - - /* Format the osd request message and submit the write */ - len = 0; - data_pages = pages; - op_idx = 0; - for (i = 0; i < locked_pages; i++) { - struct page *page = ceph_fscrypt_pagecache_page(pages[i]); - - u64 cur_offset = page_offset(page); - /* - * Discontinuity in page range? Ceph can handle that by just passing - * multiple extents in the write op. 
- */ - if (offset + len != cur_offset) { - /* If it's full, stop here */ - if (op_idx + 1 == req->r_num_ops) - break; - - /* Kick off an fscache write with what we have so far. */ - ceph_fscache_write_to_cache(inode, offset, len, caching); - - /* Start a new extent */ - osd_req_op_extent_dup_last(req, op_idx, - cur_offset - offset); - doutc(cl, "got pages at %llu~%llu\n", offset, - len); - osd_req_op_extent_osd_data_pages(req, op_idx, - data_pages, len, 0, - from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - len = 0; - offset = cur_offset; - data_pages = pages + i; - op_idx++; - } - set_page_writeback(page); - if (caching) - ceph_set_page_fscache(page); - len += thp_size(page); + ceph_osdc_put_request(req); + return -EIO; + } + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + ceph_wbc->data_pages = ceph_wbc->pages; + ceph_wbc->op_idx = 0; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + u64 cur_offset; + + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + cur_offset = page_offset(page); + + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ + if (offset + len != cur_offset) { + /* If it's full, stop here */ + if (ceph_wbc->op_idx + 1 == req->r_num_ops) + break; + + /* Kick off an fscache write with what we have so far. 
*/ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, + cur_offset - offset); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, + len, 0, + ceph_wbc->from_pool, + false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + len = 0; + offset = cur_offset; + ceph_wbc->data_pages = ceph_wbc->pages + i; + ceph_wbc->op_idx++; } - ceph_fscache_write_to_cache(inode, offset, len, caching); - - if (ceph_wbc.size_stable) { - len = min(len, ceph_wbc.i_size - offset); - } else if (i == locked_pages) { - /* writepages_finish() clears writeback pages - * according to the data length, so make sure - * data length covers all locked pages */ - u64 min_len = len + 1 - thp_size(page); - len = get_writepages_data_length(inode, pages[i - 1], - offset); - len = max(len, min_len); + + set_page_writeback(page); + + if (caching) + ceph_set_page_fscache(page); + + len += thp_size(page); + } + + ceph_fscache_write_to_cache(inode, offset, len, caching); + + if (ceph_wbc->size_stable) { + len = min(len, ceph_wbc->i_size - offset); + } else if (i == ceph_wbc->locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - thp_size(page); + len = get_writepages_data_length(inode, + ceph_wbc->pages[i - 1], + offset); + len = max(len, min_len); + } + + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); + } + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, len, + 0, ceph_wbc->from_pool, false); + osd_req_op_extent_update(req, 
ceph_wbc->op_idx, len); + + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); + + ceph_wbc->from_pool = false; + if (i < ceph_wbc->locked_pages) { + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); + ceph_wbc->num_ops -= req->r_num_ops; + ceph_wbc->locked_pages -= i; + + /* allocate new pages array for next request */ + ceph_wbc->data_pages = ceph_wbc->pages; + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + memset(ceph_wbc->data_pages + i, 0, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + } else { + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); + /* request message now owns the pages array */ + ceph_wbc->pages = NULL; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + req = NULL; + + wbc->nr_to_write -= i; + if (ceph_wbc->pages) + goto new_request; + + return 0; +} + +static +void ceph_wait_until_current_writes_complete(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct page *page; + unsigned i, nr; + + if (wbc->sync_mode != WB_SYNC_NONE && + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc->head_snapc) { + ceph_wbc->index = 0; + + while ((ceph_wbc->index <= ceph_wbc->end) && + (nr = filemap_get_folios_tag(mapping, + &ceph_wbc->index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &ceph_wbc->fbatch))) { + for (i = 0; i < nr; i++) { + page = &ceph_wbc->fbatch.folios[i]->page; + if (page_snap_context(page) != ceph_wbc->snapc) + continue; + wait_on_page_writeback(page); + } + + folio_batch_release(&ceph_wbc->fbatch); + cond_resched(); } - if (IS_ENCRYPTED(inode)) - len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct 
ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_writeback_ctl ceph_wbc; + int rc = 0; - doutc(cl, "got pages at %llu~%llu\n", offset, len); + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) + return 0; - if (IS_ENCRYPTED(inode) && - ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) - pr_warn_client(cl, - "bad encrypted write offset=%lld len=%llu\n", - offset, len); - - osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - BUG_ON(op_idx + 1 != req->r_num_ops); - - from_pool = false; - if (i < locked_pages) { - BUG_ON(num_ops <= req->r_num_ops); - num_ops -= req->r_num_ops; - locked_pages -= i; - - /* allocate new pages array for next request */ - data_pages = pages; - pages = kmalloc_array(locked_pages, sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (is_forced_umount(mapping)) { + /* we're in a forced umount, don't write! */ + return -EIO; + } + + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto out; + } + +retry: + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); + if (rc == -ENODATA) { + /* hmm, why does writepages get called when there + is no dirty data? 
*/ + rc = 0; + goto dec_osd_stopping_blocker; + } + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); + + while (!has_writeback_done(&ceph_wbc)) { + ceph_wbc.locked_pages = 0; + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; + +get_more_pages: + ceph_folio_batch_reinit(&ceph_wbc); + + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, + &ceph_wbc.index, + ceph_wbc.end, + ceph_wbc.tag, + &ceph_wbc.fbatch); + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", + ceph_wbc.tag, ceph_wbc.nr_folios); + + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) + break; + +process_folio_batch: + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + /* did we get anything? */ + if (!ceph_wbc.locked_pages) + goto release_folios; + + if (ceph_wbc.processed_in_fbatch) { + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); + + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && + ceph_wbc.locked_pages < ceph_wbc.max_pages) { + doutc(cl, "reached end fbatch, trying for more\n"); + goto get_more_pages; } - memcpy(pages, data_pages + i, - locked_pages * sizeof(*pages)); - memset(data_pages + i, 0, - locked_pages * sizeof(*pages)); - } else { - BUG_ON(num_ops != req->r_num_ops); - index = pages[i - 1]->index + 1; - /* request message now owns the pages array */ - pages = NULL; } - req->r_mtime = inode_get_mtime(inode); - ceph_osdc_start_request(&fsc->client->osdc, req); - req = NULL; + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + ceph_wbc.locked_pages = 0; + ceph_wbc.strip_unit_end = 0; - wbc->nr_to_write -= i; - if (pages) - goto new_request; + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { + ceph_wbc.nr_folios = + folio_batch_count(&ceph_wbc.fbatch); + goto process_folio_batch; + } /* * We stop writing back only if we are not doing @@ -1377,61 +1713,44 @@ new_request: * we tagged for writeback prior to entering this loop. 
*/ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) - done = true; + ceph_wbc.done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", - (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); - folio_batch_release(&fbatch); + (int)ceph_wbc.fbatch.nr, + ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL); + folio_batch_release(&ceph_wbc.fbatch); } - if (should_loop && !done) { + if (ceph_wbc.should_loop && !ceph_wbc.done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); - end = start_index - 1; /* OK even when start_index == 0 */ + /* OK even when start_index == 0 */ + ceph_wbc.end = ceph_wbc.start_index - 1; /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ - if (wbc->sync_mode != WB_SYNC_NONE && - start_index == 0 && /* all dirty pages were checked */ - !ceph_wbc.head_snapc) { - struct page *page; - unsigned i, nr; - index = 0; - while ((index <= end) && - (nr = filemap_get_folios_tag(mapping, &index, - (pgoff_t)-1, - PAGECACHE_TAG_WRITEBACK, - &fbatch))) { - for (i = 0; i < nr; i++) { - page = &fbatch.folios[i]->page; - if (page_snap_context(page) != snapc) - continue; - wait_on_page_writeback(page); - } - folio_batch_release(&fbatch); - cond_resched(); - } - } + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); - start_index = 0; - index = 0; + ceph_wbc.start_index = 0; + ceph_wbc.index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = ceph_wbc.index; + +dec_osd_stopping_blocker: + ceph_dec_osd_stopping_blocker(fsc->mdsc); out: - ceph_osdc_put_request(req); - ceph_put_snap_context(last_snapc); + ceph_put_snap_context(ceph_wbc.last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); + return rc; } - - /* 
* See if a given @snapc is either writeable, or already written. */ @@ -1447,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode, /** * ceph_find_incompatible - find an incompatible context and return it - * @page: page being dirtied + * @folio: folio being dirtied * - * We are only allowed to write into/dirty a page if the page is + * We are only allowed to write into/dirty a folio if the folio is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * - * Must be called with page lock held. + * Must be called with folio lock held. */ static struct ceph_snap_context * -ceph_find_incompatible(struct page *page) +ceph_find_incompatible(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { - doutc(cl, " %llx.%llx page %p is shutdown\n", - ceph_vinop(inode), page); + doutc(cl, " %llx.%llx folio %p is shutdown\n", + ceph_vinop(inode), folio); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc || snapc == ci->i_head_snapc) break; /* - * this page is already dirty in another (older) snap + * this folio is already dirty in another (older) snap * context! is it writeable now? 
*/ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", - ceph_vinop(inode), page, snapc); + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", + ceph_vinop(inode), folio, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); - /* yay, writeable, do it now (without dropping page lock) */ - doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", - ceph_vinop(inode), page, snapc); - if (clear_page_dirty_for_io(page)) { - int r = writepage_nounlock(page, NULL); + /* yay, writeable, do it now (without dropping folio lock) */ + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", + ceph_vinop(inode), folio, snapc); + if (folio_clear_dirty_for_io(folio)) { + int r = write_folio_nounlock(folio, NULL); if (r < 0) return ERR_PTR(r); } @@ -1511,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(*foliop, 0)); + snapc = ceph_find_incompatible(*foliop); if (snapc) { int r; @@ -1594,7 +1913,6 @@ out: const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, @@ -1602,6 +1920,7 @@ const struct address_space_operations ceph_aops = { .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, }; static void ceph_block_sigs(sigset_t *oldset) @@ -1718,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = 
vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; - struct page *page = vmf->page; - loff_t off = page_offset(page); + struct folio *folio = page_folio(vmf->page); + loff_t off = folio_pos(folio); loff_t size = i_size_read(inode); size_t len; int want, got, err; @@ -1736,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (off + thp_size(page) <= size) - len = thp_size(page); + if (off + folio_size(folio) <= size) + len = folio_size(folio); else - len = offset_in_thp(page, size); + len = offset_in_folio(folio, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); @@ -1756,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); - /* Update time before taking page lock */ + /* Update time before taking folio lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; - lock_page(page); + folio_lock(folio); - if (page_mkwrite_check_truncate(page, inode) < 0) { - unlock_page(page); + if (folio_mkwrite_check_truncate(folio, inode) < 0) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; break; } - snapc = ceph_find_incompatible(page); + snapc = ceph_find_incompatible(folio); if (!snapc) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); + /* success. we'll keep the folio locked. 
*/ + folio_mark_dirty(folio); ret = VM_FAULT_LOCKED; break; } - unlock_page(page); + folio_unlock(folio); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 62e99e65250d..a321aa6d0ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, if (ptr_pos >= i_size_read(dir)) return NULL; - if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) { + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { ceph_readdir_cache_release(cache_ctl); - cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); - if (!cache_ctl->page) { - doutc(cl, " page %lu not found\n", ptr_pgoff); + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); + if (IS_ERR(cache_ctl->folio)) { + cache_ctl->folio = NULL; + doutc(cl, " folio %lu not found\n", ptr_pgoff); return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_rwsem, no need to use page lock */ - unlock_page(cache_ctl->page); - cache_ctl->dentries = kmap(cache_ctl->page); + i_rwsem, no need to use folio lock */ + folio_unlock(cache_ctl->folio); + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); } cache_ctl->index = idx & idx_mask; @@ -1092,19 +1093,20 @@ out: return err; } -static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + struct dentry *ret; int err; int op; err = ceph_wait_on_conflict_unlink(dentry); if (err) - return err; + return ERR_PTR(err); if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ @@ -1116,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, ceph_vinop(dir), 
dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { - err = -EROFS; + ret = ERR_PTR(-EROFS); goto out; } if (op == CEPH_MDS_OP_MKDIR && ceph_quota_is_max_files_exceeded(dir)) { - err = -EDQUOT; + ret = ERR_PTR(-EDQUOT); goto out; } if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir)) { - err = -ENOKEY; + ret = ERR_PTR(-ENOKEY); goto out; } req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { - err = PTR_ERR(req); + ret = ERR_CAST(req); goto out; } mode |= S_IFDIR; req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); if (IS_ERR(req->r_new_inode)) { - err = PTR_ERR(req->r_new_inode); + ret = ERR_CAST(req->r_new_inode); req->r_new_inode = NULL; goto out_req; } @@ -1165,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + ret = ERR_PTR(err); out_req: + if (!IS_ERR(ret) && req->r_dentry != dentry) + /* Some other dentry was spliced in */ + ret = dget(req->r_dentry); ceph_mdsc_put_request(req); out: - if (!err) + if (!IS_ERR(ret)) { + if (ret) + dentry = ret; ceph_init_inode_acls(d_inode(dentry), &as_ctx); - else + } else { d_drop(dentry); + } ceph_release_acl_sec_ctx(&as_ctx); - return err; + return ret; } static int ceph_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7dd6c2275085..6ac2bd555e86 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1845,10 +1845,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { - if (ctl->page) { - kunmap(ctl->page); - put_page(ctl->page); - ctl->page = NULL; + if (ctl->folio) { + folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; } } @@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unsigned idx = 
ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != ctl->page->index) { + if (!ctl->folio || pgoff != ctl->folio->index) { ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + if (idx == 0) - ctl->page = grab_cache_page(&dir->i_data, pgoff); - else - ctl->page = find_lock_page(&dir->i_data, pgoff); - if (!ctl->page) { + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; ctl->index = -1; - return idx == 0 ? -ENOMEM : 0; + return idx == 0 ? err : 0; } /* reading/filling the cache are serialized by - * i_rwsem, no need to use page lock */ - unlock_page(ctl->page); - ctl->dentries = kmap(ctl->page); + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); if (idx == 0) memset(ctl->dentries, 0, PAGE_SIZE); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 54b3421501e9..230e0c3f341f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); + atomic64_set(&mdsc->dirty_folios, 0); + init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7c9fee9e80d4..3e2a6fa7c19a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -458,6 +458,9 @@ struct ceph_mds_client { atomic_t stopping_blockers; struct completion stopping_waiter; + atomic64_t dirty_folios; + wait_queue_head_t flush_end_wq; + atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we diff --git 
a/fs/ceph/super.c b/fs/ceph/super.c index 4344e1f11806..f3951253e393 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s) */ sync_filesystem(s); + if (atomic64_read(&mdsc->dirty_folios) > 0) { + wait_queue_head_t *wq = &mdsc->flush_end_wq; + long timeleft = wait_event_killable_timeout(*wq, + atomic64_read(&mdsc->dirty_folios) <= 0, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + spin_lock(&mdsc->stopping_lock); mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; wait = !!atomic_read(&mdsc->stopping_blockers); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7fa1e7be50e4..bb0db0cc8003 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf) } struct ceph_readdir_cache_control { - struct page *page; + struct folio *folio; struct dentry **dentries; int index; }; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index a3e2dfeedfbf..ab69d8f0cec2 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -166,8 +166,8 @@ err_out: return error; } -static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *de, umode_t mode) +static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct CodaFid newfid; if (is_root_inode(dir) && coda_iscontrol(name, len)) - return -EPERM; + return ERR_PTR(-EPERM); attrs.va_mode = mode; - error = venus_mkdir(dir->i_sb, coda_i2f(dir), + error = venus_mkdir(dir->i_sb, coda_i2f(dir), name, len, &newfid, &attrs); if (error) goto err_out; - + inode = coda_iget(dir->i_sb, &newfid, &attrs); if (IS_ERR(inode)) { error = PTR_ERR(inode); @@ 
-195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, coda_dir_inc_nlink(dir); coda_dir_update_mtime(dir); d_instantiate(de, inode); - return 0; + return NULL; err_out: d_drop(de); - return error; + return ERR_PTR(error); } /* try to make de an entry in dir_inodde linked to source_de */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7d10278db30d..5568cb74b322 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1280,8 +1280,8 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; @@ -1461,7 +1461,7 @@ out_put: put_fragment(frag); out: - return ret; + return ERR_PTR(ret); } static int configfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/coredump.c b/fs/coredump.c index 4375c70144d0..c33c177a701b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, { unsigned long addr; struct page *dump_page; + int locked, ret; dump_page = dump_page_alloc(); if (!dump_page) return 0; + ret = 0; + locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; + if (!locked) { + if (mmap_read_lock_killable(current->mm)) + goto out; + locked = 1; + } + /* * To avoid having to allocate page tables for virtual address * ranges that have never been used yet, and also to make it @@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, * NULL when encountering an empty page table entry that would * otherwise have been filled with the zero page. 
*/ - page = get_dump_page(addr); + page = get_dump_page(addr, &locked); if (page) { + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) { - dump_page_free(dump_page); - return 0; - } + if (stop) + goto out; } else { dump_skip(cprm, PAGE_SIZE); } + + if (dump_interrupted()) + goto out; + + if (!need_resched()) + continue; + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } cond_resched(); } + ret = 1; +out: + if (locked) + mmap_read_unlock(current->mm); + dump_page_free(dump_page); - return 1; + return ret; } #endif @@ -1016,7 +1042,9 @@ static const struct ctl_table coredump_sysctls[] = { .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "core_file_note_size_limit", diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5aff5934baa1..b5dfb0aa405a 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -3,6 +3,7 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_HASH + select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 select KEYS @@ -24,20 +25,16 @@ config FS_ENCRYPTION # # Also note that this option only pulls in the generic implementations of the # algorithms, not any per-architecture optimized implementations. It is -# strongly recommended to enable optimized implementations too. It is safe to -# disable these generic implementations if corresponding optimized -# implementations will always be available too; for this reason, these are soft -# dependencies ('imply' rather than 'select'). Only disable these generic -# implementations if you're sure they will never be needed, though. +# strongly recommended to enable optimized implementations too. 
config FS_ENCRYPTION_ALGS tristate - imply CRYPTO_AES - imply CRYPTO_CBC - imply CRYPTO_CTS - imply CRYPTO_ECB - imply CRYPTO_HMAC - imply CRYPTO_SHA512 - imply CRYPTO_XTS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_ECB + select CRYPTO_HMAC + select CRYPTO_SHA512 + select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 328470d40dec..b74b5937e695 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -153,8 +153,8 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page - * @page: the locked pagecache page containing the data to encrypt + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio + * @folio: the locked pagecache folio containing the data to encrypt * @len: size of the data to encrypt, in bytes * @offs: offset within @page of the data to encrypt, in bytes * @gfp_flags: memory allocation flags; see details below @@ -177,23 +177,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, * * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) - +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const struct fscrypt_inode_info *ci = inode->i_crypt_info; const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; - u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + (offs >> du_bits); unsigned int i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + 
VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + if (WARN_ON_ONCE(!folio_test_locked(folio))) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) @@ -205,7 +203,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, for (i = offs; i < offs + len; i += du_size, index++) { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, - page, ciphertext_page, + &folio->page, ciphertext_page, du_size, i, gfp_flags); if (err) { fscrypt_free_bounce_page(ciphertext_page); @@ -213,7 +211,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, } } SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)page); + set_page_private(ciphertext_page, (unsigned long)folio); return ciphertext_page; } EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 5a384dad2c72..855a0f4b7318 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC @@ -11,6 +7,7 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +#include <crypto/hkdf.h> #include "fscrypt_private.h" @@ -44,20 +41,6 @@ * there's no way to persist a random salt per master key from kernel mode. 
*/ -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) -{ - static const u8 default_salt[HKDF_HASHLEN]; - int err; - - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); - if (err) - return err; - - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); -} - /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. @@ -69,6 +52,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; + static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; int err; @@ -84,7 +68,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, goto err_free_tfm; } - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + err = hkdf_extract(hmac_tfm, master_key, master_key_size, + default_salt, HKDF_HASHLEN, prk); if (err) goto err_free_tfm; @@ -118,61 +103,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 prefix[9]; - unsigned int i; + u8 *full_info; int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HKDF_HASHLEN]; - - if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) - return -EINVAL; + full_info = kzalloc(infolen + 9, GFP_KERNEL); + if (!full_info) + return -ENOMEM; desc->tfm = hkdf->hmac_tfm; - memcpy(prefix, "fscrypt\0", 8); - prefix[8] = context; - - for (i = 0; i < okmlen; i += HKDF_HASHLEN) { - - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, HKDF_HASHLEN); - if (err) - goto out; - } - - err = crypto_shash_update(desc, prefix, sizeof(prefix)); - if (err) - goto out; - - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - - 
BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < HKDF_HASHLEN) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); + memcpy(full_info, "fscrypt\0", 8); + full_info[8] = context; + memcpy(full_info + 9, info, infolen); + + err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, + okm, okmlen); + kfree_sensitive(full_info); return err; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 40de69860dcf..7fa53d30aec3 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -130,6 +130,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); + crypto_cfg.key_type = BLK_CRYPTO_KEY_TYPE_RAW; devs = fscrypt_get_devices(sb, &num_devs); if (IS_ERR(devs)) @@ -166,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, if (!blk_key) return -ENOMEM; - err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, ci->ci_mode->keysize, + BLK_CRYPTO_KEY_TYPE_RAW, crypto_mode, fscrypt_get_dun_bytes(ci), 1U << ci->ci_data_unit_bits); if (err) { @@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) 
u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = 0; + s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_length(iter); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); - return dax_mem2blk_err(ret); + if (ret < 0) + return dax_mem2blk_err(ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; u64 length = iomap_length(iter); - s64 written = 0; + int ret; /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; + return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed @@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + length - 1) >> PAGE_SHIFT); + iter->pos >> PAGE_SHIFT, + (iter->pos + length - 1) >> PAGE_SHIFT); do { + loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - long rc; int id; + length = min_t(u64, PAGE_SIZE - offset, length); + id = dax_read_lock(); - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) + ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iter, pos, size); + ret = dax_memzero(iter, pos, length); dax_read_unlock(id); - if (rc < 0) - return rc; - pos += size; - length -= size; - written += size; + if (ret < 0) + return ret; + + ret = iomap_iter_advance(iter, &length); + if (ret) + return ret; } while (length > 0); if (did_zero) *did_zero = true; - return written; + return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(dax_truncate_page); -static loff_t dax_iomap_iter(const struct iomap_iter *iomi, - struct iov_iter *iter) +static int dax_iomap_iter(struct 
iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); @@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (pos >= end) return 0; - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return iov_iter_zero(min(length, end - pos), iter); + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { + done = iov_iter_zero(min(length, end - pos), iter); + return iomap_iter_advance(iomi, &done); + } } /* @@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } id = dax_read_lock(); - while (pos < end) { + while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); @@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - pos += xfer; - length -= xfer; - done += xfer; - - if (xfer == 0) + length = xfer; + ret = iomap_iter_advance(iomi, &length); + if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); - return done ? done : ret; + return ret; } /** @@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? 
*/ continue; } @@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) - iter.processed = PAGE_SIZE; + if (!(ret & VM_FAULT_ERROR)) { + u64 length = PAGE_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } if (iomap_errp) @@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) - iter.processed = PMD_SIZE; + if (ret != VM_FAULT_FALLBACK) { + u64 length = PMD_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } unlock_entry: @@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -static loff_t dax_range_compare_iter(struct iomap_iter *it_src, +static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; - return len; + goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { @@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (!*same) len = 0; dax_read_unlock(id); - return len; + +advance: + dest_len = len; + ret = iomap_iter_advance(it_src, &len); + if (!ret) + ret = iomap_iter_advance(it_dest, &dest_len); + return ret; out_unlock: dax_read_unlock(id); @@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret, compared = 0; + int ret, status; while ((ret = 
iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, + status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); - if (compared < 0) + if (status < 0) return ret; - src_iter.processed = dst_iter.processed = compared; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index e3634916ffb9..bd5aa136153a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -73,8 +73,13 @@ * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ -int sysctl_vfs_cache_pressure __read_mostly = 100; -EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +static int sysctl_vfs_cache_pressure __read_mostly = 100; + +unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, 100); +} +EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); @@ -211,8 +216,20 @@ static const struct ctl_table fs_dcache_sysctls[] = { }, }; +static const struct ctl_table vm_dcache_sysctls[] = { + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + static int __init init_fs_dcache_sysctls(void) { + register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } @@ -2480,7 +2497,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); - wake_up_all(d_wait); + if (wq_has_sleeper(d_wait)) + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) @@ -2687,52 +2705,6 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); -/** - * d_exact_alias - find and hash an exact unhashed alias - * @entry: dentry to add - * @inode: The inode to go 
with this dentry - * - * If an unhashed dentry with the same name/parent and desired - * inode already exists, hash and return it. Otherwise, return - * NULL. - * - * Parent directory should be locked. - */ -struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) -{ - struct dentry *alias; - unsigned int hash = entry->d_name.hash; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (!d_same_name(alias, entry->d_parent, &entry->d_name)) - continue; - spin_lock(&alias->d_lock); - if (!d_unhashed(alias)) { - spin_unlock(&alias->d_lock); - alias = NULL; - } else { - dget_dlock(alias); - __d_rehash(alias); - spin_unlock(&alias->d_lock); - } - spin_unlock(&inode->i_lock); - return alias; - } - spin_unlock(&inode->i_lock); - return NULL; -} -EXPORT_SYMBOL(d_exact_alias); - static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1096ff8562fa..42e4d6eeb29f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -12,6 +12,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -21,7 +23,6 @@ #include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> -#include <linux/parser.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> @@ -87,14 +88,14 @@ enum { Opt_err }; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_ptmxmode, "ptmxmode=%o"}, - {Opt_newinstance, "newinstance"}, - 
{Opt_max, "max=%d"}, - {Opt_err, NULL} +static const struct fs_parameter_spec devpts_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_s32 ("max", Opt_max), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("newinstance", Opt_newinstance), + fsparam_u32oct ("ptmxmode", Opt_ptmxmode), + fsparam_u32 ("uid", Opt_uid), + {} }; struct pts_fs_info { @@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi) deactivate_super(fsi->sb); } -#define PARSE_MOUNT 0 -#define PARSE_REMOUNT 1 - /* - * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. - * - * Note: @data may be NULL (in which case all options are set to default). + * devpts_parse_param - Parse mount parameters */ -static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - kuid_t uid; - kgid_t gid; - - opts->setuid = 0; - opts->setgid = 0; - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->mode = DEVPTS_DEFAULT_MODE; - opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - opts->max = NR_UNIX98_PTY_MAX; - - /* Only allow instances mounted from the initial mount - * namespace to tap the reserve pool of ptys. 
- */ - if (op == PARSE_MOUNT) - opts->reserve = - (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); - - while ((p = strsep(&data, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - int option; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - opts->uid = uid; - opts->setuid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - opts->gid = gid; - opts->setgid = 1; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; - case Opt_ptmxmode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->ptmxmode = option & S_IALLUGO; - break; - case Opt_newinstance: - break; - case Opt_max: - if (match_int(&args[0], &option) || - option < 0 || option > NR_UNIX98_PTY_MAX) - return -EINVAL; - opts->max = option; - break; - default: - pr_err("called with bogus options\n"); - return -EINVAL; - } + struct pts_fs_info *fsi = fc->s_fs_info; + struct pts_mount_opts *opts = &fsi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, devpts_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = result.uid; + opts->setuid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->setgid = 1; + break; + case Opt_mode: + opts->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_ptmxmode: + opts->ptmxmode = result.uint_32 & S_IALLUGO; + break; + case Opt_newinstance: + break; + case Opt_max: + if (result.uint_32 > NR_UNIX98_PTY_MAX) + return invalf(fc, "max out of range"); + opts->max = result.uint_32; + break; } return 0; } -static int mknod_ptmx(struct super_block *sb) +static int 
mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; int rc = -ENOMEM; @@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) } } -static int devpts_remount(struct super_block *sb, int *flags, char *data) +static int devpts_reconfigure(struct fs_context *fc) { - int err; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb); + struct pts_fs_info *new = fc->s_fs_info; - err = parse_mount_options(data, PARSE_REMOUNT, opts); + /* Apply the revised options. We don't want to change ->reserve. + * Ideally, we'd update each option conditionally on it having been + * explicitly changed, but the default is to reset everything so that + * would break UAPI... + */ + fsi->mount_opts.setuid = new->mount_opts.setuid; + fsi->mount_opts.setgid = new->mount_opts.setgid; + fsi->mount_opts.uid = new->mount_opts.uid; + fsi->mount_opts.gid = new->mount_opts.gid; + fsi->mount_opts.mode = new->mount_opts.mode; + fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode; + fsi->mount_opts.max = new->mount_opts.max; /* * parse_mount_options() restores options to default values @@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) */ update_ptmx_mode(fsi); - return err; + return 0; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) @@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations devpts_sops = { .statfs = simple_statfs, - .remount_fs = devpts_remount, .show_options = devpts_show_options, }; -static void *new_pts_fs_info(struct super_block *sb) -{ - struct pts_fs_info *fsi; - - fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); - if (!fsi) - return NULL; - - ida_init(&fsi->allocated_ptys); - fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; - fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - fsi->sb = sb; - - return 
fsi; -} - -static int -devpts_fill_super(struct super_block *s, void *data, int silent) +static int devpts_fill_super(struct super_block *s, struct fs_context *fc) { + struct pts_fs_info *fsi = DEVPTS_SB(s); struct inode *inode; - int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; + fsi->sb = s; - error = -ENOMEM; - s->s_fs_info = new_pts_fs_info(s); - if (!s->s_fs_info) - goto fail; - - error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); - if (error) - goto fail; - - error = -ENOMEM; inode = new_inode(s); if (!inode) - goto fail; + return -ENOMEM; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); - goto fail; + return -ENOMEM; } - error = mknod_ptmx(s); - if (error) - goto fail_dput; - - return 0; -fail_dput: - dput(s->s_root); - s->s_root = NULL; -fail: - return error; + return mknod_ptmx(s, fc); } /* - * devpts_mount() + * devpts_get_tree() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, devpts_fill_super); +} + +static void devpts_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations devpts_context_ops = { + .free = devpts_free_fc, + .parse_param = devpts_parse_param, + .get_tree = devpts_get_tree, + .reconfigure = devpts_reconfigure, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int devpts_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, devpts_fill_super); + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.uid = GLOBAL_ROOT_UID; + fsi->mount_opts.gid = GLOBAL_ROOT_GID; + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->mount_opts.max = NR_UNIX98_PTY_MAX; + + if (fc->purpose == FS_CONTEXT_FOR_MOUNT && + current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) + fsi->mount_opts.reserve = true; + + fc->s_fs_info = fsi; + fc->ops = &devpts_context_ops; + return 0; } static void devpts_kill_sb(struct super_block *sb) @@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .mount = devpts_mount, + .init_fs_context = devpts_init_fs_context, + .parameters = devpts_param_specs, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index e48c4f9686d3..13a3d0b26194 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -23,7 +23,7 @@ struct dlm_config_node { extern const struct rhashtable_params dlm_rhash_rsb_params; -#define DLM_MAX_ADDR_COUNT 3 +#define DLM_MAX_ADDR_COUNT 8 #define DLM_PROTO_TCP 0 #define DLM_PROTO_SCTP 1 diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c8ff88f1cdcf..e01d5f29f4d2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -741,6 +741,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, read_lock_bh(&ls->ls_rsbtbl_lock); if (!rsb_flag(r, RSB_HASHED)) { read_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } @@ -784,6 +785,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, } } else { write_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 
8afac6e2dff0..1929327ffbe1 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -576,7 +576,7 @@ static int new_lockspace(const char *name, const char *cluster, lockspace to start running (via sysfs) in dlm_ls_start(). */ error = do_uevent(ls, 1); - if (error) + if (error < 0) goto out_recoverd; /* wait until recovery is successful or failed */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d28141829c05..70abd4da17a6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1826,8 +1826,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detect multi-homed hosts but use only the first IP address."); + log_print("Try SCTP, if you want to enable multi-link."); } return 0; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d45ef541d848..019a8b4eaaf9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,7 +14,7 @@ #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ -int sysctl_drop_caches; +static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { @@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -int drop_caches_sysctl_handler(const struct ctl_table *table, int write, +static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; @@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write, } return 0; } + +static const struct ctl_table drop_caches_table[] = { + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, + }, +}; + +static int __init init_vm_drop_caches_sysctls(void) +{ + 
register_sysctl_init("vm", drop_caches_table); + return 0; +} +fs_initcall(init_vm_drop_caches_sysctls); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a9819ddb1ab8..51a5c54eb740 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -503,18 +503,24 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; struct inode *lower_dir; rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); - if (rc || d_really_is_negative(lower_dentry)) + if (rc) + goto out; + + lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, + lower_dentry, mode); + rc = PTR_ERR(lower_dentry); + if (IS_ERR(lower_dentry)) + goto out; + rc = 0; + if (d_unhashed(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) @@ -526,7 +532,7 @@ out: inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); - return rc; + return ERR_PTR(rc); } static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0b1c878317ab..e7b7f426fecf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = { .destroy_inode = ecryptfs_destroy_inode, .free_inode = ecryptfs_free_inode, .statfs = ecryptfs_statfs, - .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, .show_options = ecryptfs_show_options }; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6ea60661fa55..331e49cd1b8d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -13,12 +13,12 @@ config EROFS_FS smartphones with Android OS, LiveCDs and high-density hosts with numerous containers; - It also provides fixed-sized output compression support 
in order to - improve storage density as well as keep relatively higher compression - ratios and implements in-place decompression to reuse the file page - for compressed data temporarily with proper strategies, which is - quite useful to ensure guaranteed end-to-end runtime decompression - performance under extremely memory pressure without extra cost. + It also provides transparent compression and deduplication support to + improve storage density and maintain relatively high compression + ratios, and it implements in-place decompression to temporarily reuse + page cache for compressed data using proper strategies, which is + quite useful for ensuring guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> and the web pages at <https://erofs.docs.kernel.org> for more details. @@ -97,7 +97,7 @@ config EROFS_FS_ZIP select LZ4_DECOMPRESS default y help - Enable fixed-sized output compression for EROFS. + Enable transparent compression support for EROFS file systems. If you don't want to enable compression feature, say N. 
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 65ff39401020..2704d7a592a5 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,6 +11,7 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; + unsigned int inpages, outpages; unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; @@ -59,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[]; struct z_erofs_stream_dctx { struct z_erofs_decompress_req *rq; - unsigned int inpages, outpages; /* # of {en,de}coded pages */ int no, ni; /* the current {en,de}coded page # */ unsigned int avail_out; /* remaining bytes in the decoded buffer */ diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0cd6b5c4df98..2409d2ab0c28 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -25,8 +25,7 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) { pgoff_t index = offset >> PAGE_SHIFT; struct folio *folio = NULL; @@ -43,10 +42,10 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, return folio; } buf->page = folio_file_page(folio, index); - if (!buf->base && type == EROFS_KMAP) - buf->base = kmap_local_page(buf->page); - if (type == EROFS_NO_KMAP) + if (!need_kmap) return NULL; + if (!buf->base) + buf->base = kmap_local_page(buf->page); return buf->base + (offset & ~PAGE_MASK); } @@ -65,64 +64,47 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type) + erofs_off_t offset, bool need_kmap) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, offset, type); -} - -static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map) -{ - struct erofs_inode *vi = EROFS_I(inode); - struct 
super_block *sb = inode->i_sb; - bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - - map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ - if (map->m_la < erofs_pos(sb, lastblk)) { - map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - map->m_la; - } else { - DBG_BUGON(!tailendpacking); - map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, map->m_la); - map->m_plen = inode->i_size - map->m_la; - - /* inline data should be located in the same meta block */ - if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - map->m_flags |= EROFS_MAP_META; - } - return 0; + return erofs_bread(buf, offset, need_kmap); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = inode->i_sb; + unsigned int unit, blksz = sb->s_blocksize; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u64 chunknr; - unsigned int unit; + erofs_blk_t startblk, addrmask; + bool tailpacking; erofs_off_t pos; - void *kaddr; + u64 chunknr; int err = 0; trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; - if (map->m_la >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = map->m_llen; + map->m_flags = 0; + if (map->m_la >= inode->i_size) goto out; - } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map); + tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + if (!tailpacking && vi->startblk == EROFS_NULL_ADDR) + goto out; + pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking); + + map->m_flags = EROFS_MAP_MAPPED; + if (map->m_la < pos) { + 
map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la; + map->m_llen = pos - map->m_la; + } else { + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_llen = inode->i_size - map->m_la; + map->m_flags |= EROFS_MAP_META; + } goto out; } @@ -135,45 +117,44 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); + idx = erofs_read_metabuf(&buf, sb, pos, true); + if (IS_ERR(idx)) { + err = PTR_ERR(idx); goto out; } map->m_la = chunknr << vi->chunkbits; - map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - round_up(inode->i_size - map->m_la, sb->s_blocksize)); - - /* handle block map */ - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr; - - if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { - map->m_flags = 0; - } else { - map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); + map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, blksz)); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { + addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? 
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1; + startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) | + le32_to_cpu(idx->startblk_lo)) & addrmask; + if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = erofs_pos(sb, startblk); + map->m_flags = EROFS_MAP_MAPPED; + } + } else { + startblk = le32_to_cpu(*(__le32 *)idx); + if (startblk != (u32)EROFS_NULL_ADDR) { + map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } - goto out_unlock; - } - /* parse chunk indexes */ - idx = kaddr; - switch (le32_to_cpu(idx->blkaddr)) { - case EROFS_NULL_ADDR: - map->m_flags = 0; - break; - default: - map->m_deviceid = le16_to_cpu(idx->device_id) & - EROFS_SB(sb)->device_id_mask; - map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); - map->m_flags = EROFS_MAP_MAPPED; - break; } -out_unlock: erofs_put_metabuf(&buf); out: - if (!err) - map->m_llen = map->m_plen; + if (!err) { + map->m_plen = map->m_llen; + /* inline data should be located in the same meta block */ + if ((map->m_flags & EROFS_MAP_META) && + erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + } trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -192,7 +173,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; - erofs_off_t startoff, length; + erofs_off_t startoff; int id; erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); @@ -205,7 +186,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return -ENODEV; } if (devs->flatdev) { - map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + map->m_pa += erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } @@ -214,13 +195,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } else if 
(devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - if (!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; - startoff = erofs_pos(sb, dif->mapped_blkaddr); - length = erofs_pos(sb, dif->blocks); + startoff = erofs_pos(sb, dif->uniaddr); if (map->m_pa >= startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; erofs_fill_from_devinfo(map, sb, dif); break; @@ -312,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2b123b070a42..bf62e2836b60 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -9,14 +9,6 @@ #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) -struct z_erofs_lz4_decompress_ctx { - struct z_erofs_decompress_req *rq; - /* # of encoded, decoded pages */ - unsigned int inpages, outpages; - /* decoded block total length (used for in-place decompression) */ - unsigned int oend; -}; - static int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { @@ -55,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb, * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. 
*/ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -68,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, unsigned int i, j, top; top = 0; - for (i = j = 0; i < ctx->outpages; ++i, ++j) { + for (i = j = 0; i < rq->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -114,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, return kaddr ? 1 : 0; } -static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, +static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq, void *inpage, void *out, unsigned int *inputmargin, int *maptype, bool may_inplace) { - struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i; + unsigned int oend, omargin, total, i; struct page **in; void *src, *tmp; if (rq->inplace_io) { - omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + oend = rq->pageofs_out + rq->outputsize; + omargin = PAGE_ALIGN(oend) - oend; if (rq->partial_decoding || !may_inplace || omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) - if (rq->out[ctx->outpages - ctx->inpages + i] != + for (i = 0; i < rq->inpages; ++i) + if (rq->out[rq->outpages - rq->inpages + i] != rq->in[i]) goto docopy; kunmap_local(inpage); *maptype = 3; - return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); + return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT); } - if (ctx->inpages <= 1) { + if (rq->inpages <= 1) { *maptype = 0; return inpage; } kunmap_local(inpage); - src = erofs_vm_map_ram(rq->in, ctx->inpages); + src = erofs_vm_map_ram(rq->in, 
rq->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; @@ -152,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = z_erofs_get_gbuf(ctx->inpages); + src = z_erofs_get_gbuf(rq->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); @@ -197,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, return 0; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *dst) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst) { - struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; @@ -224,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, + src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); @@ -251,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, if (maptype == 0) { kunmap_local(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, ctx->inpages); + vm_unmap_ram(src, rq->inpages); } else if (maptype == 2) { z_erofs_put_gbuf(src); } else if (maptype != 3) { @@ -264,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; - ctx.rq = rq; - ctx.oend = rq->pageofs_out + rq->outputsize; - ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; - ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - /* one optimized fast path only for non bigpcluster cases yet */ - if 
(ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { + if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_local_page(*rq->out); dst_maptype = 0; - goto dstmap_out; - } - - /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); - if (ret < 0) { - return ret; - } else if (ret > 0) { - dst = page_address(*rq->out); - dst_maptype = 1; } else { - dst = erofs_vm_map_ram(rq->out, ctx.outpages); - if (!dst) - return -ENOMEM; - dst_maptype = 2; + /* general decoding path which can be used for all cases */ + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); + if (ret < 0) + return ret; + if (ret > 0) { + dst = page_address(*rq->out); + dst_maptype = 1; + } else { + dst = erofs_vm_map_ram(rq->out, rq->outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; + } } - -dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst); + ret = z_erofs_lz4_decompress_mem(rq, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, ctx.outpages); + vm_unmap_ram(dst, rq->outpages); return ret; } static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int nrpages_in = - PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages; const unsigned int bs = rq->sb->s_blocksize; unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; @@ -336,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, rq->outputsize -= cur; } - for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) { insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -= insz; if (!rq->in[ni]) @@ -373,7 +350,7 @@ int 
z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, unsigned int j; if (!dctx->avail_out) { - if (++dctx->no >= dctx->outpages || !rq->outputsize) { + if (++dctx->no >= rq->outpages || !rq->outputsize) { erofs_err(sb, "insufficient space for decompressed data"); return -EFSCORRUPTED; } @@ -401,7 +378,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, } if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { - if (++dctx->ni >= dctx->inpages) { + if (++dctx->ni >= rq->inpages) { erofs_err(sb, "invalid compressed data"); return -EFSCORRUPTED; } @@ -434,7 +411,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, dctx->bounced = true; } - for (j = dctx->ni + 1; j < dctx->inpages; ++j) { + for (j = dctx->ni + 1; j < rq->inpages; ++j) { if (rq->out[dctx->no] != rq->in[j]) continue; tmppage = erofs_allocpage(pgpl, rq->gfp); diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 5070d2fcc737..c6908a487054 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -101,13 +101,7 @@ static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct z_erofs_deflate *strm; int zerr, err; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 40666815046f..832cffb83a66 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> 
PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct xz_buf buf = {}; struct z_erofs_lzma *strm; enum xz_ret xz_err; diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index 7e177304967e..b4bfe14229f9 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; zstd_in_buffer in_buf = { NULL, 0, 0 }; zstd_out_buffer out_buf = { NULL, 0, 0 }; struct z_erofs_zstd *strm; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index c3b90abdee37..2fae209d0274 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,9 +58,9 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dbstart, EROFS_KMAP); + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { - erofs_err(sb, "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; @@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) ofs = 0; } erofs_put_metabuf(&buf); + if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { + if (!dir_emit_dot(f, ctx)) + return 0; + ++ctx->pos; + } return err < 0 ? 
err : 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 199395ed1c1f..9581e9bf8192 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -30,25 +30,19 @@ #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE | \ - EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1) #define EROFS_SB_EXTSLOT_SIZE 16 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. */ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) @@ -59,13 +53,14 @@ struct erofs_super_block { __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } rb; __le64 inos; /* total valid ino # (== f_files - f_favail) */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used 
for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -84,7 +79,10 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved2[23]; + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __u8 reserved2[8]; }; /* @@ -115,19 +113,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_VERSION_MASK 0x01 #define EROFS_I_DATALAYOUT_MASK 0x07 -#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 -#define EROFS_I_ALL_BIT 4 - -#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) /* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 @@ -140,45 +138,40 @@ struct erofs_inode_chunk_info { }; union erofs_inode_i_u { - /* 
total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ struct erofs_inode_chunk_info c; }; +union erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 i_reserved; + __le32 i_mtime; union erofs_inode_i_u i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; union erofs_inode_i_u i_u; @@ -248,6 +241,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) if (!i_xattr_icount) return 0; + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -266,11 +260,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) -/* 8-byte inode 
chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; /* dirent sorts in alphabet order, thus we can do binary search */ @@ -337,21 +331,20 @@ struct z_erofs_zstd_cfgs { #define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE /* - * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) - * e.g. for 4k logical cluster size, 4B if compacted 2B is off; - * (4B) + 2B + (4B) if compacted 2B is on. - * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) - * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) - * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) - * bit 4 : interlaced plain pcluster (0 - off; 1 - on) - * bit 5 : fragment pcluster (0 - off; 1 - on) + * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes: + * 4B (disabled) vs 4B+2B+4B (enabled) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */ +#define Z_EROFS_ADVISE_EXTENTS 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 #define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +/* Indicate the record size for each extent if extent metadata is used */ +#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1 +#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3 #define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { @@ -363,45 +356,24 @@ struct z_erofs_map_header { /* indicates the encoded size of tailpacking data */ __le16 h_idata_size; }; + __le32 h_extents_lo; /* extent count LSB */ }; __le16 h_advise; - /* - * bit 0-3 : algorithm type of head 1 (logical cluster type 01); - * bit 4-7 : algorithm type of head 
2 (logical cluster type 11). - */ - __u8 h_algorithmtype; - /* - * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-6 : reserved; - * bit 7 : move the whole file into packed inode or not. - */ - __u8 h_clusterbits; + union { + struct { + /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */ + __u8 h_algorithmtype; + /* + * bit 0-3 : logical cluster bits - blkszbits + * bit 4-6 : reserved + * bit 7 : pack the whole file into packed inode + */ + __u8 h_clusterbits; + }; + __le16 h_extents_hi; /* extent count MSB */ + }; }; -/* - * On-disk logical cluster type: - * 0 - literal (uncompressed) lcluster - * 1,3 - compressed lcluster (for HEAD lclusters) - * 2 - compressed lcluster (for NONHEAD lclusters) - * - * In detail, - * 0 - literal (uncompressed) lcluster, - * di_advise = 0 - * di_clusterofs = the literal data offset of the lcluster - * di_blkaddr = the blkaddr of the literal pcluster - * - * 1,3 - compressed lcluster (for HEAD lclusters) - * di_advise = 1 or 3 - * di_clusterofs = the decompressed data offset of the lcluster - * di_blkaddr = the blkaddr of the compressed pcluster - * - * 2 - compressed lcluster (for NONHEAD lclusters) - * di_advise = 2 - * di_clusterofs = - * the decompressed data offset in its own HEAD lcluster - * di_u.delta[0] = distance to this HEAD lcluster - * di_u.delta[1] = distance to the next HEAD lcluster - */ enum { Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, @@ -415,11 +387,7 @@ enum { /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) -/* - * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the - * compressed block count of a compressed extent (in logical clusters, aka. - * block count of a pcluster). 
- */ +/* Set on 1st non-head lcluster to store compressed block count (in blocks) */ #define Z_EROFS_LI_D0_CBLKCNT (1 << 11) struct z_erofs_lcluster_index { @@ -428,19 +396,36 @@ struct z_erofs_lcluster_index { __le16 di_clusterofs; union { - /* for the HEAD lclusters */ - __le32 blkaddr; + __le32 blkaddr; /* for the HEAD lclusters */ /* - * for the NONHEAD lclusters * [0] - distance to its HEAD lcluster * [1] - distance to the next HEAD lcluster */ - __le16 delta[2]; + __le16 delta[2]; /* for the NONHEAD lclusters */ } di_u; }; -#define Z_EROFS_FULL_INDEX_ALIGN(end) \ - (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) +#define Z_EROFS_MAP_HEADER_END(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header)) +#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8) + +#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27) +#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28 +#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1) +struct z_erofs_extent { + __le32 plen; /* encoded length */ + __le32 pstart_lo; /* physical offset */ + __le32 pstart_hi; /* physical offset MSB */ + __le32 lstart_lo; /* logical offset */ + __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */ + __u8 reserved[12]; /* for future use */ +}; + +static inline int z_erofs_extent_recsize(unsigned int advise) +{ + return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) & + Z_EROFS_ADVISE_EXTRECSZ_MASK); +} /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 0ffd1c63beeb..bec4b56b3826 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -112,7 +112,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) void *src; src = erofs_read_metabuf(&buf, inode->i_sb, - map->m_pa + ofs, EROFS_KMAP); + map->m_pa + ofs, true); if (IS_ERR(src)) { err = PTR_ERR(src); break; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c 
index ce3d8737df85..9c9129bca346 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -276,7 +276,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) size_t size = map.m_llen; void *src; - src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, true); if (IS_ERR(src)) return PTR_ERR(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d4b89407822a..a0ae0b4f7b01 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode)); + unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode)); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi = EROFS_SB(sb); + erofs_blk_t addrmask = BIT_ULL(48) - 1; struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; - void *kaddr; + struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - struct erofs_inode_extended *die, *copied = NULL; - union erofs_inode_i_u iu; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int ifmt, ofs; + unsigned int ifmt; + void *ptr; int err = 0; - blkaddr = erofs_blknr(sb, inode_loc); - ofs = erofs_blkoff(sb, inode_loc); - - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(kaddr)); - return PTR_ERR(kaddr); + ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode (nid: %llu) page, err %d", + vi->nid, err); + goto err_out; } - dic = kaddr + ofs; + dic = ptr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u 
of nid %llu", @@ -73,40 +71,34 @@ static int erofs_read_inode(struct inode *inode) if (ofs + vi->inode_isize <= sb->s_blocksize) { ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; + copied.i_u = die->i_u; + copied.i_nb = die->i_nb; } else { const unsigned int gotten = sb->s_blocksize - ofs; - copied = kmalloc(vi->inode_isize, GFP_KERNEL); - if (!copied) { - err = -ENOMEM; + memcpy(&copied, dic, gotten); + ptr = erofs_read_metabuf(&buf, sb, + erofs_pos(sb, blkaddr + 1), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d", + vi->nid, err); goto err_out; } - memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), - EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", - vi->nid, PTR_ERR(kaddr)); - kfree(copied); - return PTR_ERR(kaddr); - } ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, ofs); - die = copied; + memcpy((u8 *)&copied + gotten, ptr, ofs); + die = &copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* each extended inode has its own timestamp */ - inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + inode_set_mtime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - kfree(copied); break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -114,12 +106,20 @@ static int erofs_read_inode(struct inode *inode) vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - iu = dic->i_u; + copied.i_u = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - 
set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ - inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); + if (!S_ISDIR(inode->i_mode) && + ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) { + set_nlink(inode, 1); + copied.i_nb = dic->i_nb; + } else { + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); + copied.i_nb.startblk_hi = 0; + addrmask = BIT_ULL(32) - 1; + } + inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime), + sbi->fixed_nsec); inode->i_size = le32_to_cpu(dic->i_size); break; @@ -136,19 +136,26 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } switch (inode->i_mode & S_IFMT) { - case S_IFREG: case S_IFDIR: + vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; + fallthrough; + case S_IFREG: case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) | + ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); + if (vi->datalayout == EROFS_INODE_FLAT_PLAIN && + !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask)) + vi->startblk = EROFS_NULL_ADDR; + if(S_ISLNK(inode->i_mode)) { - err = erofs_fill_symlink(inode, kaddr, ofs); + err = erofs_fill_symlink(inode, ptr, ofs); if (err) goto err_out; } break; case S_IFCHR: case S_IFBLK: - inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev)); break; case S_IFIFO: case S_IFSOCK: @@ -161,12 +168,15 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) { - nblks = le32_to_cpu(iu.compressed_blocks); - } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << + (sb->s_blocksize_bits - 9); + else + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; + + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { /* fill chunked inode 
summary info */ - vi->chunkformat = le16_to_cpu(iu.c.format); + vi->chunkformat = le16_to_cpu(copied.i_u.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); @@ -176,22 +186,15 @@ static int erofs_read_inode(struct inode *inode) vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode_set_mtime_to_ts(inode, - inode_set_atime_to_ts(inode, inode_get_ctime(inode))); + inode_set_atime_to_ts(inode, + inode_set_ctime_to_ts(inode, inode_get_mtime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && (vi->datalayout == EROFS_INODE_FLAT_PLAIN || vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; - - if (!nblks) - /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; - else - inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); err_out: - DBG_BUGON(err); erofs_put_metabuf(&buf); return err; } @@ -202,13 +205,10 @@ static int erofs_fill_inode(struct inode *inode) int err; trace_erofs_fill_inode(inode); - - /* read inode base data from disk */ err = erofs_read_inode(inode); if (err) return err; - /* setup the new inode */ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; @@ -229,15 +229,10 @@ static int erofs_fill_inode(struct inode *inode) inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + default: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); return 0; - default: - return -EFSCORRUPTED; } mapping_set_large_folios(inode->i_mapping); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 686d835eb533..4ac188d5d894 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -37,8 +37,7 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb, 
const char *fmt, ...); typedef u64 erofs_nid_t; typedef u64 erofs_off_t; -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; +typedef u64 erofs_blk_t; struct erofs_device_info { char *path; @@ -47,8 +46,8 @@ struct erofs_device_info { struct dax_device *dax_dev; u64 dax_part_off; - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; enum { @@ -143,8 +142,8 @@ struct erofs_sb_info { unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ - u32 build_time_nsec; - u64 build_time; + u32 fixed_nsec; + s64 epoch; /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; @@ -152,8 +151,6 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; - u8 uuid[16]; /* 128-bit uuid for volume */ - u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; @@ -199,11 +196,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -enum erofs_kmap_type { - EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap_local_page() to map the buffer */ -}; - struct erofs_buf { struct address_space *mapping; struct file *file; @@ -212,8 +204,8 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) -#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits)) +#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) @@ -233,6 +225,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, 
INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) @@ -252,6 +245,7 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; + bool dot_omitted; unsigned int xattr_isize; unsigned int xattr_name_filter; @@ -259,7 +253,7 @@ struct erofs_inode { unsigned int *xattr_shared_xattrs; union { - erofs_blk_t raw_blkaddr; + erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; @@ -268,15 +262,13 @@ struct erofs_inode { struct { unsigned short z_advise; unsigned char z_algorithmtype[2]; - unsigned char z_logical_clusterbits; - unsigned long z_tailextent_headlcn; + unsigned char z_lclusterbits; union { - struct { - erofs_off_t z_idataoff; - unsigned short z_idata_size; - }; - erofs_off_t z_fragmentoff; + u64 z_tailextent_headlcn; + u64 z_extents; }; + erofs_off_t z_fragmentoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -387,11 +379,10 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type); + erofs_off_t offset, bool need_kmap); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -448,6 +439,7 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void 
z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, @@ -457,7 +449,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} @@ -466,7 +457,7 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index c94d0c1608a8..f7cf4f41af28 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_dirent *de; buf.mapping = dir->i_mapping; - de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 827b62665649..cadec6b1b554 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -94,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) return 
ptr; @@ -110,7 +110,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) { kfree(buffer); return ptr; @@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct file *file; - dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); + dis = erofs_read_metabuf(buf, sb, *pos, true); if (IS_ERR(dis)) return PTR_ERR(dis); @@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->file = file; } - dif->blocks = le32_to_cpu(dis->blocks); - dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + dif->blocks = le32_to_cpu(dis->blocks_lo); + dif->uniaddr = le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; @@ -255,7 +255,7 @@ static int erofs_read_superblock(struct super_block *sb) void *data; int ret; - data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + data = erofs_read_metabuf(&buf, sb, 0, true); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); @@ -268,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } - sbi->blkszbits = dsb->blkszbits; + sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; @@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->dif0.blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -308,23 +308,20 @@ static int erofs_read_superblock(struct super_block *sb) 
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid = le16_to_cpu(dsb->root_nid); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); + sbi->dif0.blocks = (sbi->dif0.blocks << 32) | + le16_to_cpu(dsb->rb.blocks_hi); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); - sbi->build_time = le64_to_cpu(dsb->build_time); - sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - + sbi->epoch = (s64)le64_to_cpu(dsb->epoch); + sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); - ret = strscpy(sbi->volume_name, dsb->volume_name, - sizeof(dsb->volume_name)); - if (ret < 0) { /* -E2BIG */ - erofs_err(sb, "bad volume name without NIL terminator"); - ret = -EFSCORRUPTED; - goto out; - } - /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -333,6 +330,8 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); + if (erofs_sb_has_48bit(sbi)) + erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. 
Use at your own risk!"); out: @@ -639,9 +638,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -653,24 +659,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err = PTR_ERR(sbi->packed_inode); - sbi->packed_inode = NULL; - return err; - } - } - err = erofs_init_managed_cache(sb); - if (err) - return err; - err = erofs_xattr_prefixes_init(sb); if (err) return err; @@ -806,6 +799,16 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) +{ + iput(sbi->packed_inode); + sbi->packed_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -815,6 +818,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); @@ -825,17 +829,10 @@ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); 
-#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; -#endif - iput(sbi->packed_inode); - sbi->packed_inode = NULL; + erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 19d586273b70..dad4e6c6c155 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -81,6 +81,7 @@ EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); +EROFS_ATTR_FEATURE(48bit); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -93,6 +94,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), + ATTR_LIST(48bit), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index df2777e05661..9cf84717a92e 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,7 +81,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; @@ -102,7 +102,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; @@ -183,7 +183,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return 
PTR_ERR(it->kaddr); @@ -286,7 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -330,7 +330,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -367,7 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d771e06db738..0671184d9cf1 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -44,8 +44,8 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ struct z_erofs_pcluster *next; - /* I: start block address of this pcluster */ - erofs_off_t index; + /* I: start physical position of this pcluster */ + erofs_off_t pos; /* L: the maximum decompression size of this round */ unsigned int length; @@ -73,6 +73,9 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; + /* I: whether compressed data is in-lined or not */ + bool from_meta; + /* L: whether partial decompression or not */ bool partial; @@ -102,14 +105,9 @@ struct z_erofs_decompressqueue { bool eio, sync; }; -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->index; -} - static inline unsigned 
int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; + return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; } static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) @@ -133,7 +131,7 @@ struct z_erofs_pcluster_slab { static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), - _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; struct z_erofs_bvec_iter { @@ -267,7 +265,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -516,6 +513,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) struct z_erofs_pcluster *pcl = fe->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); + pgoff_t poff = pcl->pos >> PAGE_SHIFT; bool may_bypass = true; /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | @@ -532,7 +530,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - folio = filemap_get_folio(mc, pcl->index + i); + folio = filemap_get_folio(mc, poff + i); if (IS_ERR(folio)) { may_bypass = false; if (!shouldalloc) @@ -575,7 +573,7 @@ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, struct folio *folio; int i; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); /* Each cached folio contains one page unless bs > ps is supported */ for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page) { @@ -607,7 +605,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; spin_lock(&pcl->lockref.lock); if 
(pcl->lockref.count <= 0) { - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) == folio) { bvec->page = NULL; @@ -644,18 +642,18 @@ static const struct address_space_operations z_erofs_cache_aops = { .invalidate_folio = z_erofs_cache_invalidate_folio, }; -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { struct inode *const inode = new_inode(sb); if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } @@ -667,16 +665,20 @@ static int z_erofs_attach_page(struct z_erofs_frontend *fe, int ret; if (exclusive) { - /* give priority for inplaceio to use file pages first */ - spin_lock(&pcl->lockref.lock); - while (fe->icur > 0) { - if (pcl->compressed_bvecs[--fe->icur].page) - continue; - pcl->compressed_bvecs[fe->icur] = *bvec; + /* Inplace I/O is limited to one page for uncompressed data */ + if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || + fe->icur <= 1) { + /* Try to prioritize inplace I/O here */ + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->lockref.lock); + return 0; + } spin_unlock(&pcl->lockref.lock); - return 0; } - spin_unlock(&pcl->lockref.lock); /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && @@ -711,27 +713,26 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl, *pre; + unsigned int pageofs_in; int 
err; - if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(map->m_plen); + pageofs_in = erofs_blkoff(sb, map->m_pa); + pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; + pcl->pclustersize = map->m_plen; + pcl->pageofs_in = pageofs_in; pcl->length = 0; pcl->partial = true; pcl->next = fe->head; + pcl->pos = map->m_pa; + pcl->pageofs_in = pageofs_in; pcl->pageofs_out = map->m_la & ~PAGE_MASK; + pcl->from_meta = map->m_flags & EROFS_MAP_META; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* @@ -741,13 +742,10 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); - if (ztailpacking) { - pcl->index = 0; /* which indicates ztailpacking */ - } else { - pcl->index = erofs_blknr(sb, map->m_pa); + if (!pcl->from_meta) { while (1) { xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl, GFP_KERNEL); if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { xa_unlock(&sbi->managed_pslots); @@ -779,7 +777,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; - erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct z_erofs_pcluster *pcl = NULL; int ret; @@ -790,9 +787,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) if (!(map->m_flags & EROFS_MAP_META)) { while (1) { rcu_read_lock(); - pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); if (!pcl || z_erofs_get_pcluster(pcl)) { - DBG_BUGON(pcl && blknr != pcl->index); + 
DBG_BUGON(pcl && map->m_pa != pcl->pos); rcu_read_unlock(); break; } @@ -826,13 +823,13 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - if (!z_erofs_is_inline_pcluster(fe->pcl)) { + if (!fe->pcl->from_meta) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { void *mptr; - mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP); + mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); @@ -871,7 +868,7 @@ static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, * It's impossible to fail after the pcluster is freezed, but in order * to avoid some race conditions, add a DBG_BUGON to observe this. */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl); lockref_mark_dead(&pcl->lockref); return true; @@ -967,7 +964,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, pos, EROFS_KMAP); + src = erofs_bread(&buf, pos, true); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); @@ -1221,7 +1218,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl) || + if (pcl->from_meta || erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; @@ -1284,6 +1281,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, + .inpages = pclusterpages, + .outpages = be->nr_pages, .pageofs_in = 
pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = pcl->pclustersize, @@ -1297,7 +1296,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) }, be->pagepool); /* must handle all compressed pages before actual file pages */ - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); @@ -1357,7 +1356,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); - if (z_erofs_is_inline_pcluster(pcl)) + if (pcl->from_meta) z_erofs_free_pcluster(pcl); else z_erofs_put_pcluster(sbi, pcl, try_free); @@ -1538,7 +1537,7 @@ out_allocfolio: folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { + filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; @@ -1655,19 +1654,20 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f, pcl = next; next = READ_ONCE(pcl->next); - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = erofs_pos(sb, pcl->index), + .m_pa = round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + pcl->pclustersize; + end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, + sb->s_blocksize); do { bvec.bv_page = NULL; if (bio && (cur != last_pa || diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 689437e99a5a..8de50df05dfe 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi 
= EROFS_I(inode); - const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; unsigned int advise; - di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true); if (IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; @@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - m->clusterofs = 1 << vi->z_logical_clusterbits; + m->clusterofs = 1 << vi->z_lclusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | @@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); - if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + if (m->clusterofs >= 1 << vi->z_lclusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize); + const unsigned int lclusterbits = vi->z_lclusterbits; const unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; @@ 
-146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; - in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true); if (IS_ERR(in)) return PTR_ERR(in); @@ -255,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lclusterbits = vi->z_lclusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; @@ -265,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, if (err) return err; - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; if (!lookback_distance) - goto err_bogus; + break; continue; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + } else { m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; - default: - erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } } -err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); @@ -308,7 +304,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || - (lcn << vi->z_logical_clusterbits) >= inode->i_size) + (lcn << vi->z_lclusterbits) >= 
inode->i_size) m->compressedblks = 1; if (m->compressedblks) @@ -329,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + if (m->compressedblks) + goto out; + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 block-sized pcluster. */ m->compressedblks = 1; - break; - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (m->delta[0] != 1) - goto err_bonus_cblkcnt; - if (m->compressedblks) - break; - fallthrough; - default: - erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; + goto out; } + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; out: m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; -err_bonus_cblkcnt: - erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) @@ -365,7 +354,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; - unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned int lclusterbits = vi->z_lclusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; @@ -386,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) m->delta[1] = 1; DBG_BUGON(1); } - } else if (m->type 
== Z_EROFS_LCLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; @@ -404,23 +391,32 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } -static int z_erofs_do_map_blocks(struct inode *inode, +static int z_erofs_map_blocks_fo(struct inode *inode, struct erofs_map_blocks *map, int flags) { - struct erofs_inode *const vi = EROFS_I(inode); - bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + bool ztailpacking = vi->z_idata_size; + unsigned int lclusterbits = vi->z_lclusterbits; struct z_erofs_maprecorder m = { .inode = inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff, afmt; + unsigned int endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; - lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? 
inode->i_size - 1 : map->m_la; + if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + return 0; + } initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); @@ -428,9 +424,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (err) goto unmap_out; - if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) - vi->z_idataoff = m.nextpackoff; - + if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking) + vi->z_fragmentoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -452,8 +447,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { - erofs_err(inode->i_sb, - "invalid logical cluster 0 at nid %llu", + erofs_err(sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -469,8 +463,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto unmap_out; break; default: - erofs_err(inode->i_sb, - "unknown type %u @ offset %llu of nid %llu", + erofs_err(sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; @@ -487,12 +480,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; - map->m_pa = vi->z_idataoff; + map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map->m_plen); + err = -EFSCORRUPTED; + goto unmap_out; + } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { - map->m_pa = erofs_pos(inode->i_sb, m.pblk); + map->m_pa = erofs_pos(sb, m.pblk); err = 
z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; @@ -511,7 +510,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -535,6 +534,115 @@ unmap_out: return err; } +static int z_erofs_map_blocks_ext(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; + bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER; + unsigned int recsz = z_erofs_extent_recsize(vi->z_advise); + erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize), recsz); + erofs_off_t lend = inode->i_size; + erofs_off_t l, r, mid, pa, la, lstart; + struct z_erofs_extent *ext; + unsigned int fmt; + bool last; + + map->m_flags = 0; + if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) { + if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + pa = le64_to_cpu(*(__le64 *)ext); + pos += sizeof(__le64); + lstart = 0; + } else { + lstart = map->m_la >> vi->z_lclusterbits; + pa = EROFS_NULL_ADDR; + } + + for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + map->m_plen = le32_to_cpu(ext->plen); + if (pa != EROFS_NULL_ADDR) { + map->m_pa = pa; + pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK; + } else { + map->m_pa = le32_to_cpu(ext->pstart_lo); + } + pos += recsz; + } + last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits)); + lend = min(lstart, lend); + lstart 
-= 1 << vi->z_lclusterbits; + } else { + lstart = lend; + for (l = 0, r = vi->z_extents; l < r; ) { + mid = l + (r - l) / 2; + ext = erofs_read_metabuf(&map->buf, sb, + pos + mid * recsz, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + + la = le32_to_cpu(ext->lstart_lo); + pa = le32_to_cpu(ext->pstart_lo) | + (u64)le32_to_cpu(ext->pstart_hi) << 32; + if (recsz > offsetof(struct z_erofs_extent, lstart_hi)) + la |= (u64)le32_to_cpu(ext->lstart_hi) << 32; + + if (la > map->m_la) { + r = mid; + lend = la; + } else { + l = mid + 1; + if (map->m_la == la) + r = min(l + 1, r); + lstart = la; + map->m_plen = le32_to_cpu(ext->plen); + map->m_pa = pa; + } + } + last = (l >= vi->z_extents); + } + + if (lstart < lend) { + map->m_la = lstart; + if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + vi->z_fragmentoff = map->m_plen; + if (recsz >= offsetof(struct z_erofs_extent, pstart_lo)) + vi->z_fragmentoff |= map->m_pa << 32; + } else if (map->m_plen) { + map->m_flags |= EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; + fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; + if (fmt) + map->m_algorithmformat = fmt - 1; + else if (interlaced && !erofs_blkoff(sb, map->m_pa)) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL) + map->m_flags |= EROFS_MAP_PARTIAL_REF; + map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK; + } + } + map->m_llen = lend - map->m_la; + if (!last && map->m_llen < sb->s_blocksize) { + erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", + map->m_llen, map->m_la, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return 0; +} + static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -561,7 +669,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = 
ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); + h = erofs_read_metabuf(&buf, sb, pos, true); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; @@ -578,8 +686,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto done; } vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15); + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) { + vi->z_extents = le32_to_cpu(h->h_extents_lo) | + ((u64)le16_to_cpu(h->h_extents_hi) << 32); + goto done; + } + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) + vi->z_idata_size = le16_to_cpu(h->h_idata_size); headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || @@ -590,7 +710,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -608,34 +727,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + if (vi->z_idata_size || + (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto 
out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, + err = z_erofs_map_blocks_fo(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) @@ -666,15 +764,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, } else { err = z_erofs_fill_inode_lazy(inode); if (!err) { - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; - } else { - err = z_erofs_do_map_blocks(inode, map, flags); - } + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) + err = z_erofs_map_blocks_ext(inode, map, flags); + else + err = z_erofs_map_blocks_fo(inode, map, flags); } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || diff --git a/fs/eventfd.c b/fs/eventfd.c index 76129bfcd663..af42b2c7d235 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags) if (fd < 0) goto err; - file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); + file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, + ctx, flags, FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } - - file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..100376863a44 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) * * we must do our busy polling with irqs enabled */ -static bool ep_busy_loop(struct eventpoll 
*ep, int nonblock) +static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); @@ -447,8 +447,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) if (!budget) budget = BUSY_POLL_BUDGET; - if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { + napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; @@ -492,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * or * Nothing to do if we already have this ID */ - if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ @@ -546,7 +546,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } @@ -554,13 +554,13 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else -static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } @@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, return ret; } +static int ep_try_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + int res; + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. 
+ */ + res = ep_send_events(ep, events, maxevents); + if (res > 0) + ep_suspend_napi_irqs(ep); + return res; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2031,23 +2047,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, while (1) { if (eavail) { - /* - * Try to transfer events to user space. In case we get - * 0 events and there's still timeout left over, we go - * trying again in search of more luck. - */ - res = ep_send_events(ep, events, maxevents); - if (res) { - if (res > 0) - ep_suspend_napi_irqs(ep); + res = ep_try_send_events(ep, events, maxevents); + if (res) return res; - } } if (timed_out) return 0; - eavail = ep_busy_loop(ep, timed_out); + eavail = ep_busy_loop(ep); if (eavail) continue; @@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return do_epoll_ctl(epfd, op, fd, &epds, false); } +static int ep_check_params(struct file *file, struct epoll_event __user *evs, + int maxevents) +{ + /* The maximum number of event must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + if (!is_file_epoll(file)) + return -EINVAL; + + return 0; +} + +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents) +{ + struct eventpoll *ep; + int ret; + + ret = ep_check_params(file, events, maxevents); + if (unlikely(ret)) + return ret; + + ep = file->private_data; + /* + * Racy call, but that's ok - it should get retried based on + * poll readiness anyway. 
+ */ + if (ep_events_available(ep)) + return ep_try_send_events(ep, events, maxevents); + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). @@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; - - /* The maximum number of event must be greater than zero */ - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ - if (!access_ok(events, maxevents * sizeof(struct epoll_event))) - return -EFAULT; + int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; - /* - * We have to check that the file structure underneath the fd - * the user passed to us _is_ an eventpoll file. - */ - if (!is_file_epoll(fd_file(f))) - return -EINVAL; + ret = ep_check_params(fd_file(f), events, maxevents); + if (unlikely(ret)) + return ret; /* * At this point it is safe to assume that the "private_data" contains diff --git a/fs/exec.c b/fs/exec.c index 506cd411f4ac..f45859ad13ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -755,8 +755,6 @@ int setup_arg_pages(struct linux_binprm *bprm, mm->arg_start = bprm->p; #endif - if (bprm->loader) - bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 8b30027d8251..fede0283d6e2 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -840,8 +840,8 @@ unlock: return err; } -static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -851,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, loff_t size = 
i_size_read(dir); if (unlikely(exfat_forced_shutdown(sb))) - return -EIO; + return ERR_PTR(-EIO); mutex_lock(&EXFAT_SB(sb)->s_lock); exfat_set_volume_dirty(sb); @@ -882,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, unlock: mutex_unlock(&EXFAT_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int exfat_check_dir_empty(struct super_block *sb, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0c899cfba578..b5845c4846b8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 8346ab9534c1..bde617a66cec 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct mnt_idmap * idmap, - struct inode * dir, struct dentry * dentry, umode_t mode) +static struct dentry *ext2_mkdir(struct mnt_idmap * idmap, + struct inode * dir, struct dentry * dentry, + umode_t mode) { struct inode * inode; int err; err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode_inc_link_count(dir); @@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap, d_instantiate_new(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8042ad873808..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? 
*/ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 2a135075468d..a4dbaccee6e7 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -25,7 +25,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; @@ -48,7 +48,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; @@ -67,7 +67,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); @@ -89,7 +89,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 02d47a64e8d1..d4164c507a90 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); - bool has_csum = ext4_has_metadata_csum(dir->i_sb); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < 
ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; @@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; /* Can we just clear INDEX flag to ignore htree information? */ - if (!ext4_has_metadata_csum(sb)) { + if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e7de7eaa374..5a20e9cd7184 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -278,7 +278,10 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -367,6 +370,8 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -1058,7 +1063,8 @@ struct ext4_inode_info { /* Number of ongoing updates on this inode */ atomic_t i_fc_updates; - atomic_t i_unwritten; /* Nr. 
of inflight conversions pending */ + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; @@ -1097,8 +1103,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. @@ -1141,6 +1145,7 @@ struct ext4_inode_info { /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif + spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; @@ -1151,8 +1156,6 @@ struct ext4_inode_info { struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - spinlock_t i_block_reservation_lock; - /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. @@ -1606,6 +1609,8 @@ struct ext4_sb_info { unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; + unsigned int s_sb_update_sec; + unsigned int s_sb_update_kb; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1821,7 +1826,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) @@ -2232,15 +2238,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ -#define EXT4_FLAGS_RESIZING 0 -#define EXT4_FLAGS_SHUTDOWN 1 -#define EXT4_FLAGS_BDEV_IS_DAX 2 +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the 
file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } +static inline int ext4_emergency_ro(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; +} + /* * Default values for user and/or group using reserved blocks */ @@ -2278,6 +2301,13 @@ static inline int ext4_forced_shutdown(struct super_block *sb) #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ /* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + + +/* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ @@ -2810,8 +2840,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); @@ -3001,6 +3030,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, 
loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -3259,14 +3290,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_metadata_csum(struct super_block *sb) -{ - return ext4_has_feature_metadata_csum(sb); -} - static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ @@ -3546,11 +3573,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); @@ -3785,34 +3812,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - 
struct ext4_io_end *io_end) +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - struct inode *inode = io_end->inode; - - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - } } extern const struct iomap_ops ext4_iomap_ops; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index da4a82456383..135e278c832e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -63,12 +63,14 @@ static void ext4_put_nojournal(handle_t *handle) */ static int ext4_journal_check_start(struct super_block *sb) { + int ret; journal_t *journal; might_sleep(); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; @@ -244,7 +246,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, } } else ext4_check_bdev_write_error(sb); - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, @@ -331,7 +334,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line, err); return err; } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, diff --git 
a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0c77697d5e90..3221714d9901 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,90 +122,6 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 -/** - * struct ext4_journal_cb_entry - Base structure for callback information. - * - * This struct is a 'seed' structure for a using with your own callback - * structs. If you are using callbacks you must allocate one of these - * or another struct of your own definition which has this struct - * as it's first element and pass it to ext4_journal_callback_add(). - */ -struct ext4_journal_cb_entry { - /* list information for other callbacks attached to the same handle */ - struct list_head jce_list; - - /* Function to call with this callback structure */ - void (*jce_func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int error); - - /* user data goes here */ -}; - -/** - * ext4_journal_callback_add: add a function to call after transaction commit - * @handle: active journal transaction handle to register callback on - * @func: callback function to call after the transaction has committed: - * @sb: superblock of current filesystem for transaction - * @jce: returned journal callback data - * @rc: journal state at commit (0 = transaction committed properly) - * @jce: journal callback data (internal and function private data struct) - * - * The registered function will be called in the context of the journal thread - * after the transaction for which the handle was created has completed. - * - * No locks are held when the callback function is called, so it is safe to - * call blocking functions from within the callback, but the callback should - * not block or run for too long, or the filesystem will be blocked waiting for - * the next transaction to commit. No journaling functions can be used, or - * there is a risk of deadlock. - * - * There is no guaranteed calling order of multiple registered callbacks on - * the same transaction. 
- */ -static inline void _ext4_journal_callback_add(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - /* Add the jce to transaction's private list */ - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -} - -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, jce); - spin_unlock(&sbi->s_md_lock); -} - - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - * Return true if object was successfully removed - */ -static inline bool ext4_journal_callback_try_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - bool deleted; - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - deleted = !list_empty(&jce->jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - return deleted; -} - int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. 
Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. + */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a07a98a4b97a..c616a16a9f36 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -63,7 +63,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -77,7 +77,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -4568,131 +4568,65 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. 
This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. Here, start and partial_begin are inclusive, end and - * partial_end are exclusive. - */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = 
ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. - */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + 
/* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4705,27 +4639,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files only. 
*/ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4739,12 +4715,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4764,73 +4736,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; - - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + goto out_inode_lock; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = ext4_zero_range(file, offset, len, mode); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto 
out_inode_lock; } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; - - max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - inode_lock(inode); /* - * We only support preallocation for extent-based files only + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; - } - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + filemap_invalidate_lock(mapping); - ret = file_modified(file); + ret = ext4_break_layouts(inode); if (ret) - goto out; + goto out_invalidate_lock; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = ext4_punch_hole(file, offset, len); + else if (mode & FALLOC_FL_COLLAPSE_RANGE) + ret = ext4_collapse_range(file, offset, len); + else if (mode & FALLOC_FL_INSERT_RANGE) + ret = ext4_insert_range(file, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + ret = ext4_zero_range(file, offset, len, mode); + else + ret = -EOPNOTSUPP; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: return ret; } @@ -5332,109 +5278,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb 
= inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. - */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. 
* Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); - if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. - */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; + + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { 
up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5442,18 +5351,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5473,100 +5380,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. - */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. 
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. 
- */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } depth = ext_depth(inode); @@ -5576,16 +5446,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - offset_lblk, split_flag, + start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | 
EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -5594,32 +5464,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } } ext4_free_ext_path(path); - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* - * if offset_lblk lies in a hole which is at start of file, use + * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); - + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ae29832aab1e..d1401d4a5513 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1551,7 +1551,6 @@ retry: ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3bd96c3d4cd0..beb078ee4811 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -688,10 +688,12 @@ out: static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + int ret; struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) @@ -700,7 +702,6 @@ 
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); - int ret; if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) @@ -800,11 +801,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; /* * We don't support synchronous mappings for non-DAX files and @@ -835,7 +841,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); @@ -878,8 +885,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? 
-EIO : 0; + if (unlikely(ret)) + return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index b40d3b29f7e5..e476c6de3074 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); - if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_ext4_flags value */ - smp_rmb(); - if (ext4_forced_shutdown(inode->i_sb)) - ret = -EROFS; + if (sb_rdonly(inode->i_sb)) goto out; - } if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index deabe29da7fb..33cd5b6b02d5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + buff = kzalloc(PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21d228073d79..38bc8d74f4cc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -951,8 +951,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return ERR_PTR(-EIO); + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1282,7 +1283,7 @@ got: inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ - if 
(ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); @@ -1298,7 +1299,7 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..f608f6554b95 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -20,6 +20,11 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) @@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -653,91 +658,109 @@ out_nofolio: } /* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. + * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). 
*/ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop) +int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; - goto out; + goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_stop_journal; - /* We don't have space in inline inode, so convert it to extent. 
*/ if (ret == -ENOSPC) { ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } + if (!da) { + brelse(iloc.bh); + /* Retry inside */ + return ext4_convert_inline_data_to_extent(mapping, inode); + } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out; + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out_release_bh; + } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); - goto out; + goto out_stop_journal; } - *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); + /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; - folio_unlock(folio); - folio_put(folio); - goto out_up_read; + goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) { - folio_unlock(folio); - folio_put(folio); - goto out_up_read; - } + if (ret < 0) + goto out_release_folio; } - ret = 1; - handle = NULL; -out_up_read: + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); + if (ret) + goto out_release_folio; + *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle && (ret != 1)) - ext4_journal_stop(handle); + brelse(iloc.bh); + return 1; + +out_release_folio: + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); +out_stop_journal: + ext4_journal_stop(handle); +out_release_bh: brelse(iloc.bh); return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, inode); +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. 
+ */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop) +{ + if (pos + len > ext4_get_max_inline_size(inode)) + return ext4_convert_inline_data_to_extent(mapping, inode); + return ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, @@ -881,94 +904,6 @@ out: return ret; } -/* - * Prepare the write for the inline data. - * If the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata) -{ - int ret; - handle_t *handle; - struct folio *folio; - struct ext4_iloc iloc; - int retries = 0; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - fsdata); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. 
- */ - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto out_journal; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) - goto out_release_page; - } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out_release_page; - - up_read(&EXT4_I(inode)->xattr_sem); - *foliop = folio; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - folio_unlock(folio); - folio_put(folio); -out_journal: - ext4_journal_stop(handle); -out: - brelse(iloc.bh); - return ret; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1012,7 +947,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; @@ -1146,7 +1081,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..bcb96caf77c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -93,7 +94,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct 
ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -114,7 +115,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -751,7 +752,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ - if (!bh->b_page) { + if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } @@ -1149,8 +1150,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* @@ -2225,7 +2227,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); @@ -2273,7 +2275,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2599,10 +2601,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. 
*/ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2817,8 +2818,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2858,8 +2860,9 @@ static int ext4_dax_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); @@ -2915,8 +2918,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; @@ -2929,8 +2933,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - foliop, fsdata); + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) @@ -3290,6 +3294,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 
else @@ -3902,6 +3910,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. 
+ */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3946,91 +4016,50 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + WARN_ON_ONCE(!inode_is_locked(inode)); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) - goto out_mutex; + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size, and also make sure that the hole + * within one block before last range. 
*/ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. 
- */ - filemap_invalidate_lock(mapping); - ret = ext4_break_layouts(inode); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -4040,54 +4069,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { - ext4_lblk_t hole_len = stop_block - first_block; + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, hole_len); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = 
ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } - ext4_es_insert_extent(inode, first_block, hole_len, ~0, + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4674,6 +4700,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4800,7 +4831,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); @@ -4887,7 +4918,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. 
But with metadata * checksumming that corrupts checksums so forbid that. */ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -5007,8 +5039,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { @@ -5228,8 +5268,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5351,8 +5392,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5464,7 +5506,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if 
(error) - goto err_out; + goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); @@ -5796,9 +5838,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5822,8 +5865,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7b9ce71c1c81..d17207386ead 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -142,7 +142,7 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); - if (ext4_has_metadata_csum(sb) && + if (ext4_has_feature_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); @@ -150,7 +150,7 @@ static int ext4_update_backup_sb(struct super_block *sb, goto out_bh; } func(es, arg); - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -351,7 +351,7 @@ void ext4_reset_inode_seed(struct inode *inode) __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); @@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp, * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. 
*/ - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; @@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(idmap, inode)) return -EPERM; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -1705,7 +1706,7 @@ int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index bb2a223b207c..d634c12f1984 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); @@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b25a27c86696..0d523e9fb3d5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -187,7 +187,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * 
/sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -209,7 +209,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices @@ -5653,7 +5653,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5687,7 +5687,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d64c04ed061a..3e26464b1425 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -21,7 +21,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if 
(!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 536d56d15072..cb5cb33b1d91 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, brelse(bh); return ERR_PTR(-EFSCORRUPTED); } - if (!ext4_has_metadata_csum(inode->i_sb) || + if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -291,36 +291,6 @@ struct dx_tail { __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(struct ext4_filename *fname, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct buffer_head *bh, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct 
ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, - char *to, struct dx_map_entry *offsets, - int count, unsigned int blocksize); -static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, - unsigned int blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); @@ -398,7 +368,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); @@ -419,7 +389,7 @@ static void ext4_dirblock_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); @@ -494,7 +464,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -523,7 +493,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -612,7 +582,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); 
return entry_space / sizeof(struct dx_entry); } @@ -622,7 +592,7 @@ static inline unsigned dx_node_limit(struct inode *dir) unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1076,7 +1046,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; - int csum = ext4_has_metadata_csum(dir->i_sb); + int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1320,7 +1290,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { @@ -1462,7 +1432,8 @@ static bool ext4_match(struct inode *parent, * sure cf_name was properly initialized before * considering the calculated hash. */ - if (IS_ENCRYPTED(parent) && fname->cf_name.name && + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; @@ -1595,10 +1566,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. 
*/ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else goto cleanup_and_exit; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); @@ -1945,7 +1921,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); @@ -2060,8 +2036,7 @@ out: return ERR_PTR(err); } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) @@ -2143,11 +2118,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, int csum_size = 0; int err, err2; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; @@ -2252,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct fake_dirent *fde; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -2396,7 +2371,7 @@ static int ext4_add_entry(handle_t *handle, struct 
dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -2427,7 +2402,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; @@ -2577,8 +2552,10 @@ again: BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); @@ -2589,8 +2566,10 @@ again: err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); @@ -2609,8 +2588,10 @@ again: dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); @@ -2635,8 +2616,10 @@ again: "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; @@ -2733,7 +2716,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, 
"get_write_access"); @@ -2973,7 +2956,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3004,19 +2987,19 @@ out: return err; } -static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -3066,7 +3049,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return ERR_PTR(err); } /* @@ -3151,8 +3134,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -3309,8 +3293,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; trace_ext4_unlink_enter(dir, dentry); /* @@ -3376,8 +3361,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + err = ext4_emergency_state(dir->i_sb); + if 
(unlikely(err)) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -4199,8 +4185,9 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) - return -EIO; + err = ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..c66e0cb29bd4 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -537,7 +537,7 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 69b8a7221a2b..179e54f3a3b6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. 
Thus we @@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. 
*/ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. */ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. 
*/ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); + if (io_end->flag & EXT4_IO_END_FAILED || + (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec))) { + ext4_add_complete_io(io_end); return; } - ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. 
We have to do it * atomically as bio completions can be racing against each @@ -522,7 +549,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 72f77f78ae8d..b7ff0d955f0d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1118,7 +1118,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); } @@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a50e5c31b937..8122d4ffb3b5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, @@ -302,7 +301,7 @@ __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) 
return 1; return es->s_checksum == ext4_superblock_csum(sb, es); @@ -312,7 +311,7 @@ void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); @@ -448,9 +447,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ -#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. @@ -458,8 +454,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: - * 1. More than an hour has passed since the last superblock update, and - * 2. More than 16MB have been written since the last superblock update. + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. 
* * @sb: The superblock */ @@ -473,14 +471,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb) __u64 lifetime_write_kbytes; __u64 diff_size; - if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || - !journal || (journal->j_flags & JBD2_UNMOUNT)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); - if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + @@ -495,32 +494,18 @@ static void ext4_maybe_update_superblock(struct super_block *sb) */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); - if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); - - spin_lock(&sbi->s_md_lock); - while (!list_empty(&txn->t_private_list)) { - jce = list_entry(txn->t_private_list.next, - struct ext4_journal_cb_entry, jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); } /* @@ -707,11 +692,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!continue_fs && !sb_rdonly(sb)) { - set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + if (!continue_fs && !ext4_emergency_ro(sb) && 
journal) + jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -719,9 +701,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); @@ -737,17 +723,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_id); } - if (sb_rdonly(sb) || continue_fs) + if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem - * modifications. We don't set SB_RDONLY because that requires - * sb->s_umount semaphore and setting it without proper remount - * procedure is confusing code such as freeze_super() leading to - * deadlocks and other problems. + * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without proper remount procedure is + * confusing code such as freeze_super() leading to deadlocks + * and other problems. */ + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) @@ -765,7 +751,8 @@ static void update_super_work(struct work_struct *work) * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. 
*/ - if (!sb_rdonly(sbi->s_sb) && journal) { + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; @@ -819,7 +806,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -844,7 +831,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -879,7 +866,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -959,7 +946,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -1053,7 +1040,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -1306,18 +1293,17 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { 
ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); @@ -1325,13 +1311,14 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!sb_rdonly(sb) && !aborted) { - ext4_clear_feature_journal_needs_recovery(sb); - ext4_clear_feature_orphan_present(sb); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (!sb_rdonly(sb)) + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } ext4_commit_super(sb); + } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); @@ -1426,7 +1413,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); @@ -2785,6 +2771,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + if (ext4_emergency_ro(sb)) + SEQ_OPTS_PUTS("emergency_ro"); + + if (ext4_forced_shutdown(sb)) + SEQ_OPTS_PUTS("shutdown"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3205,7 
+3204,7 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sbi->s_sb)) { + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; @@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; @@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (sb_rdonly(sb) || + if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; @@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb, if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. 
*/ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) @@ -4642,7 +4642,8 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; @@ -4973,10 +4974,7 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_sb_upd_work before destroying the journal. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } @@ -5013,6 +5011,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb) return 0; } +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5263,6 +5279,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + 
sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is @@ -5404,30 +5422,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "suppressed and not mounted read-only"); goto failed_mount3a; } else { + const char *journal_option; + /* Nojournal mode, all journal mount options are illegal */ - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); goto failed_mount3a; } - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "commit=%lu, fs mounted w/o journal", - sbi->s_commit_interval / HZ); - goto failed_mount3a; - } - if (EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "data=, fs mounted w/o journal"); - goto failed_mount3a; - } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5616,9 +5621,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount9; } - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5665,10 +5672,7 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush 
s_sb_upd_work before journal destroy. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5773,10 +5777,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. @@ -5973,7 +5973,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); @@ -6090,8 +6090,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6109,7 +6108,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6336,8 +6335,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6419,7 +6419,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6575,7 +6575,7 @@ static int __ext4_remount(struct 
fs_context *fc, struct super_block *sb) flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_forced_shutdown(sb)) { + if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } @@ -6780,6 +6780,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6791,9 +6792,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6817,22 +6818,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? - (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6935,12 +6943,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6951,6 +6972,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7288,7 +7313,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); @@ -7381,12 +7406,9 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - static int __init ext4_init_fs(void) { - int i, err; + int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; @@ -7394,9 +7416,6 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency 
*/ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) - init_waitqueue_head(&ext4__ioend_wq[i]); - err = ext4_init_es(); if (err) return err; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ddb54608ca2e..987bd00f916a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7647e9f6e190..7ab8f2e8e815 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); @@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } @@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static inline int +int __xattr_check_inode(struct inode *inode, struct 
ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { @@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, function, line); } -#define xattr_check_inode(inode, header, end) \ - __xattr_check_inode((inode), (header), (end), __func__, __LINE__) - static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) @@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; + end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) @@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; - void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) @@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); -cleanup: brelse(iloc.bh); return error; } @@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; - void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); @@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; raw_inode = 
ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - ret = xattr_check_inode(inode, header, end); - if (ret) - goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; @@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = xattr_check_inode(inode, header, is->s.end); - if (error) - return error; /* Find the named attribute. 
*/ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); @@ -2786,14 +2775,10 @@ retry: */ base = IFIRST(header); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + end = ITAIL(inode, raw_inode); min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; - ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b25c2d7b5f99..1fedf44d4fb6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -67,6 +67,9 @@ struct ext4_xattr_entry { ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) +#define ITAIL(inode, raw_inode) \ + ((void *)(raw_inode) + \ + EXT4_SB((inode)->i_sb)->s_inode_size) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* @@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line); + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index efda9a022981..cf77987d0698 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -21,7 +21,7 @@ #include "iostat.h" #include <trace/events/f2fs.h> -#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache 
*f2fs_inode_entry_slab; @@ -58,7 +58,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page; + struct folio *folio; struct f2fs_io_info fio = { .sbi = sbi, .type = META, @@ -74,37 +74,37 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: - page = f2fs_grab_cache_page(mapping, index, false); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - fio.page = page; + fio.page = &folio->page; err = f2fs_submit_page_bio(&fio); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { - f2fs_handle_page_eio(sbi, page_folio(page), META); - f2fs_put_page(page, 1); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_handle_page_eio(sbi, folio, META); + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } out: - return page; + return &folio->page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) @@ -381,12 +381,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) -{ - return __f2fs_write_meta_page(page, wbc, FS_META_IO); -} - static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -507,7 +501,6 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, } const struct address_space_operations f2fs_meta_aops = { - .writepage = f2fs_write_meta_page, 
.writepages = f2fs_write_meta_pages, .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -1237,7 +1230,7 @@ static int block_operations(struct f2fs_sb_info *sbi) retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { - int locked; + bool need_lock = sbi->umount_lock_holder != current; if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); @@ -1246,11 +1239,13 @@ retry_flush_quotas: } f2fs_unlock_all(sbi); - /* only failed during mount/umount/freeze/quotactl */ - locked = down_read_trylock(&sbi->sb->s_umount); - f2fs_quota_sync(sbi->sb, -1); - if (locked) + /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */ + if (!need_lock) { + f2fs_do_quota_sync(sbi->sb, -1); + } else if (down_read_trylock(&sbi->sb->s_umount)) { + f2fs_do_quota_sync(sbi->sb, -1); up_read(&sbi->sb->s_umount); + } cond_resched(); goto retry_flush_quotas; } @@ -1344,21 +1339,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to no space"); - } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && - f2fs_nat_bitmap_enabled(sbi)) { - f2fs_enable_nat_bits(sbi); - set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Rebuild and enable nat_bits"); - } - } - spin_lock_irqsave(&sbi->cp_lock, flags); + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else @@ -1541,8 +1528,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* 
write nat bits */ - if ((cpc->reason & CP_UMOUNT) && - is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; @@ -1867,7 +1853,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) struct cp_control cpc; cpc.reason = __get_cp_reason(sbi); - if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || + sbi->umount_lock_holder == current) { int ret; f2fs_down_write(&sbi->gc_lock); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 985690d81a82..9b94810675c1 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1150,6 +1150,7 @@ retry: f2fs_compress_ctx_add_page(cc, page_folio(page)); if (!PageUptodate(page)) { + f2fs_handle_page_eio(sbi, page_folio(page), DATA); release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index de4da6d9cd93..54f89f0ee69b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -319,8 +319,7 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; @@ -328,34 +327,41 @@ static void f2fs_write_end_io(struct bio *bio) if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page, false); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + enum count_type type; + + if (fscrypt_is_bounce_folio(folio)) { + struct folio *io_folio = folio; - fscrypt_finalize_bounce_page(&page); + folio = fscrypt_pagecache_folio(io_folio); + fscrypt_free_bounce_page(&io_folio->page); + } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(page)) { - 
f2fs_compress_write_end_io(bio, page); + if (f2fs_is_compressed_page(&folio->page)) { + f2fs_compress_write_end_io(bio, &folio->page); continue; } #endif + type = WB_DATA_TYPE(&folio->page, false); + if (unlikely(bio->bi_status)) { - mapping_set_error(page->mapping, -EIO); + mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && - page_folio(page)->index != nid_of_node(page)); + f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && + folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); - if (f2fs_in_warm_node_list(sbi, page)) - f2fs_del_fsync_node_entry(sbi, page); - clear_page_private_gcing(page); - end_page_writeback(page); + if (f2fs_in_warm_node_list(sbi, folio)) + f2fs_del_fsync_node_entry(sbi, &folio->page); + clear_page_private_gcing(&folio->page); + folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) @@ -413,6 +419,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -438,6 +445,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; + + if (fio->type == DATA && + F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + op_flags |= REQ_PRIO; + return op_flags; } @@ -876,6 +888,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct folio *folio = page_folio(fio->page); if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) @@ -889,8 +902,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, - page_folio(fio->page)->index, fio, GFP_NOIO); + f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, + folio->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -899,8 +912,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1041,8 +1053,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); - if (!bio) - return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; @@ -1193,18 +1203,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, - pgoff_t *next_pgofs) +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; int err; - page = f2fs_grab_cache_page(mapping, index, for_write); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (IS_ERR(folio)) + return folio; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { @@ -1239,9 +1248,9 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, goto put_err; } got_it: - if (PageUptodate(page)) { - unlock_page(page); - return 
page; + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + return folio; } /* @@ -1252,48 +1261,51 @@ got_it: * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - return page; + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + return folio; } - err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr, + err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; - return page; + return folio; put_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = find_get_page_flags(mapping, index, FGP_ACCESSED); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) + goto read; + if (folio_test_uptodate(folio)) + return folio; + f2fs_folio_put(folio, false); - page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); - if (IS_ERR(page)) - return page; +read: + folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); + if (IS_ERR(folio)) + return folio; - if (PageUptodate(page)) - return page; + if (folio_test_uptodate(folio)) + return folio; - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); + folio_wait_locked(folio); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -1301,23 +1313,23 @@ struct page *f2fs_find_data_page(struct inode *inode, 
pgoff_t index, * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); - if (IS_ERR(page)) - return page; + folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); + if (IS_ERR(folio)) + return folio; /* wait for read completion */ - lock_page(page); - if (unlikely(page->mapping != mapping || !PageUptodate(page))) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -2178,6 +2190,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, int i; int ret = 0; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + from_dnode = false; + goto out_put_dnode; + } + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + @@ -2221,10 +2239,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; - if (unlikely(f2fs_cp_error(sbi))) { - ret = -EIO; - goto out_put_dnode; - } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: @@ -2500,7 +2514,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) return 0; retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ @@ -2921,29 +2935,6 @@ redirty_out: return err; } -static int f2fs_write_data_page(struct page *page, - struct 
writeback_control *wbc) -{ - struct folio *folio = page_folio(page); -#ifdef CONFIG_F2FS_FS_COMPRESSION - struct inode *inode = folio->mapping->host; - - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) - goto out; - - if (f2fs_compressed_file(inode)) { - if (f2fs_is_compressed_cluster(inode, folio->index)) { - folio_redirty_for_writepage(wbc, folio); - return AOP_WRITEPAGE_ACTIVATE; - } - } -out: -#endif - - return f2fs_write_single_data_page(folio, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0, true); -} - /* * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from @@ -3266,10 +3257,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping, int ret; bool locked = false; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; @@ -3390,7 +3377,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, restart: /* check inline_data */ - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -3453,7 +3440,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -3483,7 +3470,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -4101,7 +4088,6 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { 
.read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, - .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, @@ -4195,7 +4181,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); - if (flags & IOMAP_WRITE) + + /* + * If the blocks being overwritten are already allocated, + * f2fs_map_lock and f2fs_balance_fs are not necessary. + */ + if ((flags & IOMAP_WRITE) && + !f2fs_overwrite_io(inode, offset, length)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 468828288a4a..16c2dfb4f595 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndonate_files = sbi->donate_files; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = atomic_read(&sbi->atomic_files); @@ -501,6 +502,8 @@ static int stat_show(struct seq_file *s, void *v) si->compr_inode, si->compr_blocks); seq_printf(s, " - Swapfile Inode: %u\n", si->swapfile_inode); + seq_printf(s, " - Donate Inode: %u\n", + si->ndonate_files); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 54dd52de7269..5a63ff0df03b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -551,7 +551,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = 
f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1afa7be16e7d..f1576dc6ec67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,6 +62,7 @@ enum { FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, + FAULT_INCONSISTENT_FOOTER, FAULT_MAX, }; @@ -114,6 +115,13 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_GC_MERGE 0x02000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 +#define F2FS_MOUNT_NAT_BITS 0x10000000 +#define F2FS_MOUNT_INLINECRYPT 0x20000000 +/* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. + */ +#define F2FS_MOUNT_LAZYTIME 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -830,6 +838,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ @@ -849,6 +858,11 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + + /* linked in global inode list for cache donation */ + struct list_head gdonate_list; + pgoff_t donate_start, donate_end; /* inclusive */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ @@ -1273,6 +1287,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + DONATE_INODE, /* for all inode to 
donate pages */ NR_INODE_TYPE, }; @@ -1628,6 +1643,9 @@ struct f2fs_sb_info { unsigned int warm_data_age_threshold; unsigned int last_age_weight; + /* control donate caches */ + unsigned int donate_files; + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -1659,6 +1677,7 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1800,6 +1819,9 @@ struct f2fs_sb_info { u64 committed_atomic_block; u64 revoked_atomic_block; + /* carve out reserved_blocks from total blocks */ + bool carve_out; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -2015,7 +2037,7 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct page *page) { return (struct f2fs_node *)page_address(page); } @@ -2219,6 +2241,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. 
+ */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); @@ -2765,33 +2817,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } -static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, - pgoff_t index, bool for_write) +static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, + pgoff_t index, bool for_write) { - struct page *page; + struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + fgf_t fgf_flags; + if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); + fgf_flags = FGP_LOCK | FGP_ACCESSED; else - page = find_lock_page(mapping, index); - if (page) - return page; + fgf_flags = FGP_LOCK; + folio = __filemap_get_folio(mapping, index, fgf_flags, 0); + if (!IS_ERR(folio)) + return folio; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) - return NULL; + return ERR_PTR(-ENOMEM); } if (!for_write) - return grab_cache_page(mapping, index); + return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, index); + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); - return page; + return folio; +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct folio *folio = 
f2fs_grab_cache_folio(mapping, index, for_write); + + if (IS_ERR(folio)) + return NULL; + return &folio->page; } static inline struct page *f2fs_pagecache_get_page( @@ -2804,16 +2869,23 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } -static inline void f2fs_put_page(struct page *page, int unlock) +static inline void f2fs_folio_put(struct folio *folio, bool unlock) { - if (!page) + if (!folio) return; if (unlock) { - f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); - unlock_page(page); + f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); + folio_unlock(folio); } - put_page(page); + folio_put(folio); +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -3624,7 +3696,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); -int f2fs_quota_sync(struct super_block *sb, int type); +int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); @@ -3647,7 +3719,8 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, + const struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); @@ -3662,12 +3735,14 @@ int f2fs_truncate_inode_blocks(struct 
inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); @@ -3687,7 +3762,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); @@ -3758,8 +3832,10 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked); +#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ + 
f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); @@ -3871,11 +3947,11 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, - pgoff_t *next_pgofs); -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); @@ -3902,6 +3978,22 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; +static inline struct page *f2fs_find_data_page(struct inode *inode, + pgoff_t index, pgoff_t *next_pgofs) +{ + struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs); + + return &folio->page; +} + +static inline struct page *f2fs_get_lock_data_page(struct inode *inode, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write); + + return &folio->page; +} + /* * gc.c */ @@ -3966,7 +4058,8 @@ struct f2fs_stat_info { unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, 
ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; @@ -4231,6 +4324,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); +unsigned int f2fs_donate_files(void); +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f92a9fba9991..abbcbb5865a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -707,31 +707,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; if (!offset && !cache_only) return 0; if (cache_only) { - page = find_lock_page(mapping, index); - if (page && PageUptodate(page)) + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) + return 0; + if (folio_test_uptodate(folio)) goto truncate_out; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } - page = f2fs_get_lock_data_page(inode, index, true); - if (IS_ERR(page)) - return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio) == -ENOENT ? 
0 : PTR_ERR(folio); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true, true); - zero_user(page, offset, PAGE_SIZE - offset); + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_segment(folio, offset, folio_size(folio)); /* An encrypted inode should have a key and truncate the last page. */ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); return 0; } @@ -759,7 +761,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -1834,18 +1836,32 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: + f2fs_down_write(&sbi->pin_sem); + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (has_not_enough_free_secs(sbi, 0, 0)) { + f2fs_up_write(&sbi->pin_sem); + err = -ENOSPC; + f2fs_warn_ratelimited(sbi, + "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "reclaim enough free segment when checkpoint is enabled", + inode->i_ino, pg_start, pg_end); + goto out_err; + } + } + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? 
ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); - if (err && err != -ENODATA) + if (err && err != -ENODATA) { + f2fs_up_write(&sbi->pin_sem); goto out_err; + } } - f2fs_down_write(&sbi->pin_sem); - err = f2fs_allocate_pinning_section(sbi); if (err) { f2fs_up_write(&sbi->pin_sem); @@ -2448,6 +2464,52 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) return ret; } +static void f2fs_keep_noreuse_range(struct inode *inode, + loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + u64 start, end; + + if (!S_ISREG(inode->i_mode)) + return; + + if (offset >= max_bytes || len > max_bytes || + (offset + len) > max_bytes) + return; + + start = offset >> PAGE_SHIFT; + end = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) { + inode_unlock(inode); + return; + } + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + /* let's remove the range, if len = 0 */ + if (!len) { + if (!list_empty(&F2FS_I(inode)->gdonate_list)) { + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + } + } else { + if (list_empty(&F2FS_I(inode)->gdonate_list)) { + list_add_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + sbi->donate_files++; + } else { + list_move_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + } + F2FS_I(inode)->donate_start = start; + F2FS_I(inode)->donate_end = end - 1; + } + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + inode_unlock(inode); +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3446,6 +3508,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) (u32 __user *)arg); } +static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg) 
+{ + struct inode *inode = file_inode(filp); + __u32 level; + + if (get_user(level, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX) + return -EINVAL; + + inode_lock(inode); + F2FS_I(inode)->ioprio_hint = level; + inode_unlock(inode); + return 0; +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -4547,6 +4626,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_compress_file(filp); case F2FS_IOC_GET_DEV_ALIAS_FILE: return f2fs_ioc_get_dev_alias_file(filp, arg); + case F2FS_IOC_IO_PRIO: + return f2fs_ioc_io_prio(filp, arg); default: return -ENOTTY; } @@ -5147,12 +5228,16 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, } err = generic_fadvise(filp, offset, len, advice); - if (!err && advice == POSIX_FADV_DONTNEED && - test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && - f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + if (err) + return err; - return err; + if (advice == POSIX_FADV_DONTNEED && + (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode))) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + else if (advice == POSIX_FADV_NOREUSE) + f2fs_keep_noreuse_range(inode, offset, len); + return 0; } #ifdef CONFIG_COMPAT @@ -5261,6 +5346,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DECOMPRESS_FILE: case F2FS_IOC_COMPRESS_FILE: case F2FS_IOC_GET_DEV_ALIAS_FILE: + case F2FS_IOC_IO_PRIO: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faf9fa1c804d..2b8f9239bede 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1449,14 +1449,14 @@ out: } static int move_data_page(struct inode *inode, block_t bidx, int gc_type, - unsigned int segno, int off) + unsigned int segno, int off) { - struct page *page; + struct folio *folio; int err = 
0; - page = f2fs_get_lock_data_page(inode, bidx, true); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, bidx, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; @@ -1468,12 +1468,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; if (gc_type == BG_GC) { - if (folio_test_writeback(page_folio(page))) { + if (folio_test_writeback(folio)) { err = -EAGAIN; goto out; } - set_page_dirty(page); - set_page_private_gcing(page); + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,37 +1483,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = page, + .page = &folio->page, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, }; - bool is_dirty = PageDirty(page); + bool is_dirty = folio_test_dirty(folio); retry: - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { + folio_mark_dirty(folio); + if (folio_clear_dirty_for_io(folio)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(page); + set_page_private_gcing(&folio->page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(page); + clear_page_private_gcing(&folio->page); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) - set_page_dirty(page); + folio_mark_dirty(folio); } } out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -1542,7 +1542,6 @@ next_step: entry = sum; for (off = 0; off < usable_blks_in_seg; off++, entry++) { - struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int 
ofs_in_node, nofs; @@ -1585,6 +1584,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + struct folio *data_folio; int err; inode = f2fs_iget(sb, dni.ino); @@ -1635,15 +1635,15 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, start_bidx, + data_folio = f2fs_get_read_data_folio(inode, start_bidx, REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (IS_ERR(data_page)) { + if (IS_ERR(data_folio)) { iput(inode); continue; } - f2fs_put_page(data_page, 0); + f2fs_folio_put(data_folio, false); add_gc_inode(gc_list, inode); continue; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3e3c35d4c98b..ad92e9008781 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -119,7 +119,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { struct page *ipage; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { folio_unlock(folio); return PTR_ERR(ipage); @@ -237,7 +237,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -265,7 +265,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *ipage; - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -312,7 +312,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -331,7 +331,7 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = 
f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); @@ -361,7 +361,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct page *ipage; void *inline_dentry; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -609,7 +609,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) if (err) goto out; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out_fname; @@ -644,7 +644,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct page *page = NULL; int err = 0; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -734,7 +734,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -765,7 +765,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -797,7 +797,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3dd25f64d6f1..83f862578fc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -34,10 +34,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, 
bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if (f2fs_is_atomic_file(inode)) { - set_inode_flag(inode, FI_ATOMIC_DIRTIED); + if (f2fs_is_atomic_file(inode)) return; - } mark_inode_dirty_sync(inode); } @@ -410,7 +408,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -757,7 +755,7 @@ void f2fs_update_inode_page(struct inode *inode) struct page *node_page; int count = 0; retry: - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); @@ -765,8 +763,12 @@ retry: if (err == -ENOENT) return; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; +stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } @@ -789,6 +791,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + /* + * no need to update inode page, ultimately f2fs_evict_inode() will + * clear dirty status of inode. 
+ */ + if (f2fs_cp_error(sbi)) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) { f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; @@ -804,6 +813,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } +static void f2fs_remove_donate_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (list_empty(&F2FS_I(inode)->gdonate_list)) + return; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + spin_unlock(&sbi->inode_lock[DONATE_INODE]); +} + /* * Called at the last iput() if i_nlink is zero */ @@ -838,6 +860,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); + f2fs_remove_donate_inode(inode); if (!IS_DEVICE_ALIASING(inode)) f2fs_destroy_extent_tree(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a278c7da8177..8f8b9b843bdf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + if (inode->i_nlink == 0) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + goto out_iput; + } + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { @@ -684,23 +692,23 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + return ERR_PTR(-EIO); err = f2fs_dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode = 
f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; @@ -722,12 +730,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); f2fs_balance_fs(sbi, true); - return 0; + return NULL; out_fail: clear_inode_flag(inode, FI_INC_LINK); f2fs_handle_failed_inode(inode); - return err; + return ERR_PTR(err); } static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f88392fc4ba9..5f15c224bf78 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio) { - return NODE_MAPPING(sbi) == page->mapping && - IS_DNODE(page) && is_cold_node(page); + return NODE_MAPPING(sbi) == folio->mapping && + IS_DNODE(&folio->page) && is_cold_node(&folio->page); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -778,7 +778,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = f2fs_get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_inode_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -1130,26 +1130,33 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) unsigned int nofs = 0; struct f2fs_inode *ri; struct dnode_of_data dn; - struct page *page; + struct folio *folio; trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) { + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, inode->i_ino, + from, 
ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; } - page = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - set_new_dnode(&dn, inode, page, NULL, 0); - unlock_page(page); + set_new_dnode(&dn, inode, &folio->page, NULL, 0); + folio_unlock(folio); - ri = F2FS_INODE(page); + ri = F2FS_INODE(&folio->page); switch (level) { case 0: case 1: @@ -1178,7 +1185,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(page, offset[0], true); + dn.nid = get_nid(&folio->page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1199,7 +1206,7 @@ skip_partial: BUG(); } if (err == -ENOENT) { - set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, "truncate node fail, ino:%lu, nid:%u, " @@ -1210,18 +1217,18 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(page, offset[0], true)) { - lock_page(page); - BUG_ON(page->mapping != NODE_MAPPING(sbi)); - set_nid(page, offset[0], 0, true); - unlock_page(page); + if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + folio_lock(folio); + BUG_ON(folio->mapping != NODE_MAPPING(sbi)); + set_nid(&folio->page, offset[0], 0, true); + folio_unlock(folio); } offset[1] = 0; offset[0]++; nofs += err; } fail: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 
0 : err; } @@ -1238,7 +1245,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = f2fs_get_node_page(sbi, nid); + npage = f2fs_get_xnode_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1449,10 +1456,32 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, - struct page *parent, int start) +static int sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct page *page, pgoff_t nid, + enum node_type ntype) { - struct page *page; + if (unlikely(nid != nid_of_node(page) || + (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + (ntype == NODE_TYPE_XATTR && + !f2fs_has_xattr_block(ofs_of_node(page))) || + time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; + } + return 0; +} + +static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start, + enum node_type ntype) +{ + struct folio *folio; int err; if (!nid) @@ -1460,11 +1489,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(folio)) + return folio; - err = read_node_page(page, 0); + err = read_node_page(&folio->page, 0); if (err < 0) { goto out_put_err; } else if (err == LOCKED_PAGE) { @@ -1475,54 +1504,72 @@ repeat: if (parent) f2fs_ra_node_pages(parent, start + 1, 
MAX_RA_NODE); - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto out_err; } - if (!f2fs_inode_chksum_verify(sbi, page)) { + if (!f2fs_inode_chksum_verify(sbi, &folio->page)) { err = -EFSBADCRC; goto out_err; } page_hit: - if (likely(nid == nid_of_node(page))) - return page; - - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), - next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - err = -EFSCORRUPTED; + err = sanity_check_node_footer(sbi, &folio->page, nid, ntype); + if (!err) + return folio; out_err: - ClearPageUptodate(page); + folio_clear_uptodate(folio); out_put_err: /* ENOENT comes from read_node_page which is not an error. 
*/ if (err != -ENOENT) - f2fs_handle_page_eio(sbi, page_folio(page), NODE); - f2fs_put_page(page, 1); + f2fs_handle_page_eio(sbi, folio, NODE); + f2fs_folio_put(folio, true); return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - return __get_node_page(sbi, nid, NULL, 0); + struct folio *folio = __get_node_folio(sbi, nid, NULL, 0, + NODE_TYPE_REGULAR); + + return &folio->page; +} + +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); +} + +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + struct folio *folio = f2fs_get_inode_folio(sbi, ino); + + return &folio->page; +} + +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid) +{ + struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0, + NODE_TYPE_XATTR); + + return &folio->page; } struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); + struct folio *folio = __get_node_folio(sbi, nid, parent, start, + NODE_TYPE_REGULAR); - return __get_node_page(sbi, nid, parent, start); + return &folio->page; } static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) @@ -1561,11 +1608,11 @@ iput_out: iput(inode); } -static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; struct folio_batch fbatch; - struct page *last_page = NULL; + struct folio *last_folio = NULL; int nr_folios; folio_batch_init(&fbatch); @@ -1577,45 +1624,45 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); 
folio_batch_release(&fbatch); return ERR_PTR(-EIO); } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (last_page) - f2fs_put_page(last_page, 0); + if (last_folio) + f2fs_folio_put(last_folio, false); - get_page(page); - last_page = page; - unlock_page(page); + folio_get(folio); + last_folio = folio; + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } - return last_page; + return last_folio; } static int __write_node_page(struct page *page, bool atomic, bool *submitted, @@ -1694,7 +1741,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ - if (f2fs_in_warm_node_list(sbi, page)) { + if (f2fs_in_warm_node_list(sbi, folio)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; @@ -1769,13 +1816,6 @@ release_page: return err; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - return __write_node_page(page, false, NULL, wbc, false, - FS_NODE_IO, NULL); -} - int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id) @@ -1783,16 +1823,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, pgoff_t index; struct folio_batch fbatch; int ret = 0; - struct page *last_page = NULL; + 
struct folio *last_folio = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_folios; int nwritten = 0; if (atomic) { - last_page = last_fsync_dnode(sbi, ino); - if (IS_ERR_OR_NULL(last_page)) - return PTR_ERR_OR_ZERO(last_page); + last_folio = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_folio)) + return PTR_ERR_OR_ZERO(last_folio); } retry: folio_batch_init(&fbatch); @@ -1804,73 +1844,73 @@ retry: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); ret = -EIO; goto out; } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page) && page != last_page) { + if (!folio_test_dirty(folio) && folio != last_folio) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - if (!atomic || page == last_page) { - set_fsync_mark(page, 1); + if (!atomic || folio == last_folio) { + set_fsync_mark(&folio->page, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(page)) { + if (IS_INODE(&folio->page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - f2fs_update_inode(inode, page); - set_dentry_mark(page, + 
f2fs_update_inode(inode, &folio->page); + set_dentry_mark(&folio->page, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ - if (!PageDirty(page)) - set_page_dirty(page); + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - ret = __write_node_page(page, atomic && - page == last_page, + ret = __write_node_page(&folio->page, atomic && + folio == last_folio, &submitted, wbc, true, FS_NODE_IO, seq_id); if (ret) { - unlock_page(page); - f2fs_put_page(last_page, 0); + folio_unlock(folio); + f2fs_folio_put(last_folio, false); break; } else if (submitted) { nwritten++; } - if (page == last_page) { - f2fs_put_page(page, 0); + if (folio == last_folio) { + f2fs_folio_put(folio, false); marked = true; break; } @@ -1883,11 +1923,11 @@ continue_unlock: } if (!ret && atomic && !marked) { f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", - ino, page_folio(last_page)->index); - lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true, true); - set_page_dirty(last_page); - unlock_page(last_page); + ino, last_folio->index); + folio_lock(last_folio); + f2fs_folio_wait_writeback(last_folio, NODE, true, true); + folio_mark_dirty(last_folio); + folio_unlock(last_folio); goto retry; } out: @@ -1920,18 +1960,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) return 1; } -static bool flush_dirty_inode(struct page *page) +static bool flush_dirty_inode(struct folio *folio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(&folio->page); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) return false; - f2fs_update_inode(inode, page); - unlock_page(page); + f2fs_update_inode(inode, &folio->page); + folio_unlock(folio); iput(inode); return true; @@ -1951,32 
+1991,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(page)) + if (!IS_INODE(&folio->page)) continue; - lock_page(page); - - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { -continue_unlock: - unlock_page(page); - continue; - } + folio_lock(folio); - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) + goto unlock; + if (!folio_test_dirty(folio)) + goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); continue; } - unlock_page(page); +unlock: + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -2005,7 +2040,7 @@ next_step: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2021,27 +2056,27 @@ next_step: * 1. dentry dnodes * 2. 
file dnodes */ - if (step == 0 && IS_DNODE(page)) + if (step == 0 && IS_DNODE(&folio->page)) continue; - if (step == 1 && (!IS_DNODE(page) || - is_cold_node(page))) + if (step == 1 && (!IS_DNODE(&folio->page) || + is_cold_node(&folio->page))) continue; - if (step == 2 && (!IS_DNODE(page) || - !is_cold_node(page))) + if (step == 2 && (!IS_DNODE(&folio->page) || + !is_cold_node(&folio->page))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) - lock_page(page); - else if (!trylock_page(page)) + folio_lock(folio); + else if (!folio_trylock(folio)) continue; - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } @@ -2051,29 +2086,29 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(page) && flush_dirty_inode(page)) + if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) goto lock_node; write_node: - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - ret = __write_node_page(page, false, &submitted, + ret = __write_node_page(&folio->page, false, &submitted, wbc, do_balance, io_type, NULL); if (ret) - unlock_page(page); + folio_unlock(folio); else if (submitted) nwritten++; @@ -2207,7 
+2242,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { - .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -2269,24 +2303,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int i; - bool ret = true; - - f2fs_down_read(&nm_i->nat_tree_lock); - for (i = 0; i < nm_i->nat_blocks; i++) { - if (!test_bit_le(i, nm_i->nat_block_bitmap)) { - ret = false; - break; - } - } - f2fs_up_read(&nm_i->nat_tree_lock); - - return ret; -} - static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2717,7 +2733,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -2965,23 +2981,7 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, - unsigned int valid) -{ - if (valid == 0) { - __set_bit_le(nat_ofs, nm_i->empty_nat_bits); - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); - return; - } - - __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_ofs, nm_i->full_nat_bits); - else - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); -} - -static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2990,7 +2990,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid 
= 0; int i = 0; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; if (nat_index == 0) { @@ -3001,36 +3001,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - - __update_nat_bits(nm_i, nat_index, valid); -} - -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs; - - f2fs_down_read(&nm_i->nat_tree_lock); - - for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { - unsigned int valid = 0, nid_ofs = 0; - - /* handle nid zero due to it should never be used */ - if (unlikely(nat_ofs == 0)) { - valid = 1; - nid_ofs = 1; - } - - for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { - if (!test_bit_le(nid_ofs, - nm_i->free_nid_bitmap[nat_ofs])) - valid++; - } - - __update_nat_bits(nm_i, nat_ofs, valid); + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; } - f2fs_up_read(&nm_i->nat_tree_lock); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3049,7 +3030,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - if ((cpc->reason & CP_UMOUNT) || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -3096,7 +3077,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - update_nat_bits(sbi, start_nid, page); + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3127,7 +3108,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (cpc->reason & CP_UMOUNT) { + if (enabled_nat_bits(sbi, cpc)) { f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); f2fs_up_write(&nm_i->nat_tree_lock); @@ -3143,7 +3124,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (cpc->reason & CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3180,18 +3161,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; + if (!enabled_nat_bits(sbi, NULL)) + return 0; + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) - return 0; - nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3208,12 +3186,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - clear_ckpt_flags(sbi, 
CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", - cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + disable_nat_bits(sbi, true); return 0; } + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3224,7 +3203,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; for (i = 0; i < nm_i->nat_blocks; i++) { @@ -3296,6 +3275,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + if (!test_opt(sbi, NAT_BITS)) + disable_nat_bits(sbi, true); + err = __get_nat_bitmaps(sbi); if (err) return err; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 6aea13024ac1..103a437e6425 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,6 +52,13 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, +}; + /* * For node information */ @@ -248,7 +255,7 @@ static inline nid_t nid_of_node(struct page *node_page) return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(struct page *node_page) +static inline unsigned int ofs_of_node(const struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); @@ -342,7 +349,7 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(struct page *node_page) +static inline bool IS_DNODE(const struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); @@ -389,7 +396,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int 
is_node(struct page *page, int type) +static inline int is_node(const struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & BIT(type); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c282e8a0a2ec..396ef71f41e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2096,7 +2096,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; if (!force) { - if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || + (!se->valid_blocks && + !IS_CURSEG(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -2320,10 +2322,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || + F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) @@ -2806,7 +2807,7 @@ find_other_zone: MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; - f2fs_bug_on(sbi, 1); + f2fs_bug_on(sbi, !pinning); goto out_unlock; } } @@ -2848,7 +2849,7 @@ got_it: out_unlock: spin_unlock(&free_i->segmap_lock); - if (ret == -ENOSPC) + if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } @@ -2921,6 +2922,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return curseg->segno; } +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = 
NULL_SEGNO; + curseg->next_segno = 0; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -2939,7 +2947,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) - curseg->segno = NULL_SEGNO; + reset_curseg_fields(curseg); return ret; } @@ -3710,13 +3718,6 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -static void reset_curseg_fields(struct curseg_info *curseg) -{ - curseg->inited = false; - curseg->segno = NULL_SEGNO; - curseg->next_segno = 0; -} - int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3902,6 +3903,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { + struct folio *folio = page_folio(fio->page); enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3912,10 +3914,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { - if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) + if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - end_page_writeback(fio->page); - if (f2fs_in_warm_node_list(fio->sbi, fio->page)) + folio_end_writeback(folio); + if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } @@ -4154,22 +4156,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, f2fs_update_data_blkaddr(dn, new_addr); } -void f2fs_wait_on_page_writeback(struct page *page, - enum 
page_type type, bool ordered, bool locked) +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked) { - if (folio_test_writeback(page_folio(page))) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (folio_test_writeback(folio)) { + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ - f2fs_submit_merged_ipu_write(sbi, NULL, page); + f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page); if (ordered) { - wait_on_page_writeback(page); - f2fs_bug_on(sbi, locked && - folio_test_writeback(page_folio(page))); + folio_wait_writeback(folio); + f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { - wait_for_stable_page(page); + folio_wait_stable(folio); } } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 943be4f1d6d2..0465dc00b349 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -559,13 +559,16 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) { - unsigned int segno, left_blocks, blocks; int i; /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); @@ -576,6 +579,10 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, /* check current data section for dentry blocks. 
*/ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 83d6fb97dcae..9c8d3aee89af 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, mutex_unlock(&sbi->umount_mutex); } spin_unlock(&f2fs_list_lock); - return count; + return count ?: SHRINK_EMPTY; } unsigned long f2fs_shrink_scan(struct shrinker *shrink, @@ -130,6 +130,96 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, return freed; } +unsigned int f2fs_donate_files(void) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int donate_files = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + donate_files += sbi->donate_files; + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + + return donate_files; +} + +static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, + unsigned int reclaim_caches_kb) +{ + struct inode *inode; + struct f2fs_inode_info *fi; + unsigned int nfiles = sbi->donate_files; + pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10); + + while (npages && nfiles--) { + pgoff_t len; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + 
if (!inode) + continue; + + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 0 : npages - len; + invalidate_inode_pages2_range(inode->i_mapping, + fi->donate_start, fi->donate_end); + iput(inode); + cond_resched(); + } + return npages << (PAGE_SHIFT - 10); +} + +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list && reclaim_caches_kb) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); +} + void f2fs_join_shrinker(struct f2fs_sb_info *sbi) { spin_lock(&f2fs_list_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19b67828ae32..f087b2b71c89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -63,6 +63,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", + [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, @@ -190,6 +191,7 @@ enum { Opt_memory_mode, Opt_age_extent_cache, Opt_errors, + Opt_nat_bits, Opt_err, }; @@ -269,6 +271,7 @@ static match_table_t f2fs_tokens = { {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, {Opt_errors, "errors=%s"}, + {Opt_nat_bits, "nat_bits"}, {Opt_err, NULL}, }; @@ -383,10 +386,10 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct super_block *sb, int qtype, +static int 
f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, substring_t *args) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; char *qname; int ret = -EINVAL; @@ -424,9 +427,9 @@ errout: return ret; } -static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); @@ -483,12 +486,11 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int f2fs_set_test_dummy_encryption(struct super_block *sb, +static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, const char *opt, const substring_t *arg, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); struct fs_parameter param = { .type = fs_value_is_string, .string = arg->from ? arg->from : "", @@ -671,9 +673,8 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) #endif #endif -static int parse_options(struct super_block *sb, char *options, bool is_remount) +static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -687,7 +688,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - goto default_check; + return 0; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -728,10 +729,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: - /* this option mounts f2fs with ro */ + /* requires ro mount, checked in f2fs_default_check */ set_opt(sbi, NORECOVERY); - if (!f2fs_readonly(sb)) - return -EINVAL; break; case 
Opt_discard: if (!f2fs_hw_support_discard(sbi)) { @@ -772,16 +771,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_user_xattr: - f2fs_info(sbi, "user_xattr options not supported"); - break; case Opt_nouser_xattr: - f2fs_info(sbi, "nouser_xattr options not supported"); - break; case Opt_inline_xattr: - f2fs_info(sbi, "inline_xattr options not supported"); - break; case Opt_noinline_xattr: - f2fs_info(sbi, "noinline_xattr options not supported"); + case Opt_inline_xattr_size: + f2fs_info(sbi, "xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -793,10 +787,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_acl: - f2fs_info(sbi, "acl options not supported"); - break; case Opt_noacl: - f2fs_info(sbi, "noacl options not supported"); + f2fs_info(sbi, "acl options not supported"); break; #endif case Opt_active_logs: @@ -838,7 +830,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + if (f2fs_sb_has_device_alias(sbi)) { f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } @@ -919,18 +911,15 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_fault_injection: - f2fs_info(sbi, "fault_injection options not supported"); - break; - case Opt_fault_type: - f2fs_info(sbi, "fault_type options not supported"); + f2fs_info(sbi, "fault injection options not supported"); break; #endif case Opt_lazytime: - sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, LAZYTIME); break; case Opt_nolazytime: - sb->s_flags &= ~SB_LAZYTIME; + clear_opt(sbi, LAZYTIME); break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -944,32 +933,32 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, PRJQUOTA); 
break; case Opt_usrjquota: - ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); if (ret) return ret; break; case Opt_grpjquota: - ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); if (ret) return ret; break; case Opt_prjjquota: - ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); if (ret) return ret; break; case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sb, USRQUOTA); + ret = f2fs_clear_qf_name(sbi, USRQUOTA); if (ret) return ret; break; case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sb, GRPQUOTA); + ret = f2fs_clear_qf_name(sbi, GRPQUOTA); if (ret) return ret; break; case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sb, PRJQUOTA); + ret = f2fs_clear_qf_name(sbi, PRJQUOTA); if (ret) return ret; break; @@ -1039,14 +1028,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], is_remount); if (ret) return ret; break; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - sb->s_flags |= SB_INLINECRYPT; + set_opt(sbi, INLINECRYPT); #else f2fs_info(sbi, "inline encryption not supported"); #endif @@ -1322,13 +1311,20 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_nat_bits: + set_opt(sbi, NAT_BITS); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } -default_check: + return 0; +} + +static int f2fs_default_check(struct f2fs_sb_info *sbi) +{ #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1418,6 +1414,12 @@ default_check: f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + + if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, 
"norecovery requires readonly mount"); + return -EINVAL; + } + return 0; } @@ -1441,6 +1443,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->gdonate_list); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1527,6 +1530,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (!ret && f2fs_is_atomic_file(inode)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + return ret; } @@ -1737,22 +1744,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (f2fs_readonly(sb)) return 0; /* IO error happened before */ - if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; /* must be clean, since sync_filesystem() was already called */ - if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY)) return -EINVAL; + sbi->umount_lock_holder = current; + /* Let's flush checkpoints and stop the thread. 
*/ - f2fs_flush_ckpt_thread(F2FS_SB(sb)); + f2fs_flush_ckpt_thread(sbi); + + sbi->umount_lock_holder = NULL; /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ - set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + set_sbi_flag(sbi, SBI_IS_FREEZING); return 0; } @@ -1836,7 +1849,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; spin_lock(&sbi->stat_lock); - + if (sbi->carve_out) + buf->f_blocks -= sbi->current_reserved_blocks; user_block_count = sbi->user_block_count; total_valid_node_count = valid_node_count(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -2128,6 +2142,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) seq_printf(seq, ",errors=%s", "panic"); + if (test_opt(sbi, NAT_BITS)) + seq_puts(seq, ",nat_bits"); + return 0; } @@ -2175,8 +2192,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, MERGE_CHECKPOINT); + set_opt(sbi, LAZYTIME); F2FS_OPTION(sbi).unusable_cap = 0; - sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_has_blkzoned(sbi)) @@ -2318,6 +2335,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + bool no_nat_bits = !test_opt(sbi, NAT_BITS); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2329,6 +2347,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; + sbi->umount_lock_holder = current; + #ifdef CONFIG_QUOTA org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -2359,7 +2379,7 @@ static int f2fs_remount(struct super_block *sb, int 
*flags, char *data) default_options(sbi, true); /* parse mount options */ - err = parse_options(sb, data, true); + err = parse_options(sbi, data, true); if (err) goto restore_opts; @@ -2374,6 +2394,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } #endif + err = f2fs_default_check(sbi); + if (err) + goto restore_opts; + /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); @@ -2444,6 +2468,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) { + err = -EINVAL; + f2fs_warn(sbi, "switch nat_bits option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -2552,6 +2582,8 @@ skip: limit_reserve_root(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + + sbi->umount_lock_holder = NULL; return 0; restore_checkpoint: if (need_enable_checkpoint) { @@ -2592,6 +2624,8 @@ restore_opts: #endif sbi->mount_opt = org_mount_opt; sb->s_flags = old_sb_flags; + + sbi->umount_lock_holder = NULL; return err; } @@ -2908,7 +2942,7 @@ out: return ret; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); @@ -2956,11 +2990,21 @@ int f2fs_quota_sync(struct super_block *sb, int type) return ret; } +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + int ret; + + F2FS_SB(sb)->umount_lock_holder = current; + ret = f2fs_do_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = NULL; + return ret; +} + static int f2fs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { struct inode *inode; - int err; + int err = 0; /* if quota sysfile exists, deny enabling quota with specific file */ if 
(f2fs_sb_has_quota_ino(F2FS_SB(sb))) { @@ -2971,31 +3015,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) return -EXDEV; - err = f2fs_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = current; + + err = f2fs_do_quota_sync(sb, type); if (err) - return err; + goto out; inode = d_inode(path->dentry); err = filemap_fdatawrite(inode->i_mapping); if (err) - return err; + goto out; err = filemap_fdatawait(inode->i_mapping); if (err) - return err; + goto out; err = dquot_quota_on(sb, type, format_id, path); if (err) - return err; + goto out; inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); - - return 0; +out: + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } static int __f2fs_quota_off(struct super_block *sb, int type) @@ -3006,7 +3053,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - err = f2fs_quota_sync(sb, type); + err = f2fs_do_quota_sync(sb, type); if (err) goto out_put; @@ -3029,6 +3076,8 @@ static int f2fs_quota_off(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + F2FS_SB(sb)->umount_lock_holder = current; + err = __f2fs_quota_off(sb, type); /* @@ -3038,6 +3087,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } @@ -3170,7 +3222,7 @@ int f2fs_dquot_initialize(struct inode *inode) return 0; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { return 0; } @@ -4220,6 +4272,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + else + dump_stack(); /* * Continue filesystem 
operators if errors=continue. Should not set @@ -4495,7 +4549,11 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options, false); + err = parse_options(sbi, options, false); + if (err) + goto free_options; + + err = f2fs_default_check(sbi); if (err) goto free_options; @@ -4533,6 +4591,14 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + if (test_opt(sbi, INLINECRYPT)) + sb->s_flags |= SB_INLINECRYPT; + + if (test_opt(sbi, LAZYTIME)) + sb->s_flags |= SB_LAZYTIME; + else + sb->s_flags &= ~SB_LAZYTIME; + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); super_set_sysfs_name_bdev(sb); sb->s_iflags |= SB_I_CGROUPWB; @@ -4703,6 +4769,7 @@ try_onemore: if (err) goto free_compress_inode; + sbi->umount_lock_holder = current; #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4718,8 +4785,10 @@ try_onemore: if (err) goto free_meta; - if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) { + skip_recovery = true; goto reset_checkpoint; + } /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && @@ -4769,10 +4838,10 @@ try_onemore: } } +reset_checkpoint: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif -reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * write pointer consistency of cursegs and other zones are already @@ -4829,6 +4898,8 @@ reset_checkpoint: f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + sbi->umount_lock_holder = NULL; return 0; sync_free_meta: @@ -4931,6 +5002,8 @@ static void kill_f2fs_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb->s_root) { + sbi->umount_lock_holder = current; + set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); 
f2fs_stop_discard_thread(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d15c68b28952..c69161366467 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -880,18 +905,50 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. 
F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } +static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) +{ + unsigned int res = 0; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + res = f2fs_donate_files(); + + return sysfs_emit(buf, "%u\n", res); +} + +static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + f2fs_reclaim_caches(t); + + return count; +} + +#define F2FS_TUNE_RW_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0644 }, \ + .show = f2fs_tune_show, \ + .store = f2fs_tune_store, \ +} + static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1065,6 +1122,7 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif +F2FS_SBI_GENERAL_RW_ATTR(carve_out); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1252,41 +1310,43 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), + ATTR_LIST(carve_out), NULL, }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + 
BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1343,6 +1403,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_sb_feat); +F2FS_TUNE_RW_ATTR(reclaim_caches_kb); + +static struct attribute *f2fs_tune_attrs[] = { + BASE_ATTR_LIST(reclaim_caches_kb), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_tune); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -1362,15 +1430,34 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; 
static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static const struct sysfs_ops f2fs_tune_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_tune_ktype = { + .default_groups = f2fs_tune_groups, + .sysfs_ops = &f2fs_tune_attr_ops, +}; + +static struct kobject f2fs_tune = { + .kset = &f2fs_kset, +}; + static ssize_t f2fs_stat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1607,6 +1694,11 @@ int __init f2fs_init_sysfs(void) if (ret) goto put_kobject; + ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, + NULL, "tuning"); + if (ret) + goto put_kobject; + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); if (!f2fs_proc_root) { ret = -ENOMEM; @@ -1614,7 +1706,9 @@ int __init f2fs_init_sysfs(void) } return 0; + put_kobject: + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); return ret; @@ -1622,6 +1716,7 @@ put_kobject: void f2fs_exit_sysfs(void) { + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3f3874943679..c691b35618ad 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -282,7 +282,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = f2fs_get_node_page(sbi, inode->i_ino); + page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -303,7 +303,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. 
*/ - xpage = f2fs_get_node_page(sbi, xnid); + xpage = f2fs_get_xnode_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -449,7 +449,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = f2fs_get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); @@ -475,7 +475,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); f2fs_alloc_nid_failed(sbi, new_nid); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index f06f6ba643cc..23e9b9371ec3 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -339,8 +339,8 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&MSDOS_SB(sb)->s_lock); fat_flush_inodes(sb, dir, inode); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } /***** Unlink a file */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 926c26e90ef8..dd910edd2404 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -841,8 +841,8 @@ out: return err; } -static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *vfat_mkdir(struct 
mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); mutex_unlock(&MSDOS_SB(sb)->s_lock); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, diff --git a/fs/file.c b/fs/file.c index d868cdb95d1e..dc3f7e120e3e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,28 @@ #include "internal.h" +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +{ + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} + /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count @@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) return true; } - /* - * If the reference count was already in the dead zone, then this - * put() operation is imbalanced. Warn, put the reference count back to - * DEAD and tell the caller to not deconstruct the object. - */ - if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { - atomic_long_set(&ref->refcnt, FILE_REF_DEAD); - return false; - } - - /* - * This is a put() operation on a saturated refcount. 
Restore the - * mean saturation value and tell the caller to not deconstruct the - * object. - */ - if (cnt > FILE_REF_MAXREF) - atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); - return false; + return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); @@ -418,17 +423,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -577,6 +590,7 @@ repeat: __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); @@ -612,22 +626,14 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. 
At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ - void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; @@ -642,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - WARN_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -650,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file) /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } @@ -679,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -1178,8 +1184,23 @@ struct fd fdget_raw(unsigned int fd) */ static inline bool file_needs_f_pos_lock(struct file *file) { - return (file->f_mode & FMODE_ATOMIC_POS) && - (file_count(file) > 1 || file->f_op->iterate_shared); + if (!(file->f_mode & FMODE_ATOMIC_POS)) + return false; + if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) + return true; + if (file->f_op->iterate_shared) + return true; + return 
false; +} + +bool file_seek_cur_needs_f_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; + + VFS_WARN_ON_ONCE((file_count(file) > 1) && + !mutex_is_locked(&file->f_pos_lock)); + return true; } struct fd fdget_pos(unsigned int fd) @@ -1187,7 +1208,7 @@ struct fd fdget_pos(unsigned int fd) struct fd f = fdget(fd); struct file *file = fd_file(f); - if (file && file_needs_f_pos_lock(file)) { + if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } @@ -1230,14 +1251,34 @@ __releases(&files->file_lock) struct fdtable *fdt; /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. 
*/ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); - tofree = fdt->fd[fd]; + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); diff --git a/fs/file_table.c b/fs/file_table.c index 5c00dc38558d..c04ed94cdc4b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (unlikely(get_nr_files() >= files_stat.max_files) && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -511,31 +512,37 @@ void flush_delayed_fput(void) } EXPORT_SYMBOL_GPL(flush_delayed_fput); -void fput(struct file *file) +static void __fput_deferred(struct file *file) { - if (file_ref_put(&file->f_ref)) { - struct task_struct *task = current; + struct task_struct *task = current; + + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } - if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { - file_free(file); + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; - } - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_task_work, ____fput); - if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) - return; - /* - * After this task has run exit_task_work(), - * task_work_add() will fail. Fall through to delayed - * fput to avoid leaking *file. - */ - } - - if (llist_add(&file->f_llist, &delayed_fput_list)) - schedule_delayed_work(&delayed_fput_work, 1); + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. 
+ */ } + + if (llist_add(&file->f_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); } +void fput(struct file *file) +{ + if (unlikely(file_ref_put(&file->f_ref))) + __fput_deferred(file); +} +EXPORT_SYMBOL(fput); + /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without @@ -549,10 +556,32 @@ void __fput_sync(struct file *file) if (file_ref_put(&file->f_ref)) __fput(file); } - -EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); +/* + * Equivalent to __fput_sync(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close_sync(struct file *file) +{ + if (likely(file_ref_put_close(&file->f_ref))) + __fput(file); +} + +/* + * Equivalent to fput(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close(struct file *file) +{ + if (file_ref_put_close(&file->f_ref)) + __fput_deferred(file); +} + void __init files_init(void) { struct kmem_cache_args args = { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac..cc57367fb641 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. 
*/ -unsigned int dirtytime_expire_interval = 12 * 60 * 60; +static unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - return 0; -} -__initcall(start_dirtytime_writeback); - -int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } +static const struct ctl_table vm_fs_writeback_table[] = { + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, +}; + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + register_sysctl_init("vm", vm_fs_writeback_table); + return 0; +} +__initcall(start_dirtytime_writeback); + /** * __mark_inode_dirty - internal function to mark an inode dirty * diff --git a/fs/fsopen.c b/fs/fsopen.c index 094a7f510edf..1aaf4cb2afb2 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; - param.file = fget(aux); + param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3805f9b06c9d..fa8f1141ea74 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -781,9 +781,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int 
create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, - struct fuse_args *args, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -792,11 +792,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_forget_link *forget; if (fuse_is_bad(dir)) - return -EIO; + return ERR_PTR(-EIO); forget = fuse_alloc_forget(); if (!forget) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); @@ -826,29 +826,43 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) - return PTR_ERR(d); + return d; - if (d) { + if (d) fuse_change_entry_timeout(d, &outarg); - dput(d); - } else { + else fuse_change_entry_timeout(entry, &outarg); - } fuse_dir_changed(dir); - return 0; + return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); - return err; + return ERR_PTR(err); +} + +static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + /* + * Note that when creating anything other than a directory we + * can be sure create_new_entry() will NOT return an alternate + * dentry as d_splice_alias() only returns an alternate dentry + * for directories. So we don't need to check for that case + * when passing back the result. 
+ */ + WARN_ON_ONCE(S_ISDIR(mode)); + + return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -871,7 +885,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(idmap, fm, &args, dir, entry, mode); + return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, @@ -898,8 +912,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return err; } -static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -934,7 +948,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[1].value = entry->d_name.name; args.in_args[2].size = len; args.in_args[2].value = link; - return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); + return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1131,7 +1145,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); + err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct 
inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9bb3be21d2b..fd1147aa3891 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -820,7 +820,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger @@ -885,7 +885,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * For writes, iomap_dio_rw only triggers manual page faults, so we @@ -957,7 +957,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. */ @@ -1024,7 +1024,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. 
If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 65c07aa95718..d7220a6fe8f5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -607,14 +607,19 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) if (gh && (ret & LM_OUT_CANCELED)) gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { - /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { - list_move_tail(&gh->gh_list, &gl->gl_holders); + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gl->gl_target = gl->gl_state; gh = find_first_waiter(gl); - gl->gl_target = gh->gh_state; - if (do_promote(gl)) - goto out; - goto retry; + if (gh) { + gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; + do_xmote(gl, gh, gl->gl_target); + return; + } + goto out; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || @@ -627,7 +632,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) switch(state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: -retry: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ @@ -661,7 +665,8 @@ retry: do_promote(gl); } out: - clear_bit(GLF_LOCK, &gl->gl_flags); + if (!test_bit(GLF_CANCELING, &gl->gl_flags)) + clear_bit(GLF_LOCK, &gl->gl_flags); } static bool is_system_glock(struct gfs2_glock *gl) @@ -807,6 +812,7 @@ skip_inval: } if (ls->ls_ops->lm_lock) { + set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); @@ -825,6 +831,7 @@ skip_inval: /* The operation will be completed asynchronously. */ return; } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. 
*/ @@ -843,12 +850,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); + /* While a demote is in progress, the GLF_LOCK flag must be set. */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -860,18 +868,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); @@ -898,12 +910,8 @@ void glock_set_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) gfs2_dump_glock(NULL, gl, true); - } } /** @@ -919,12 +927,8 @@ void glock_clear_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == 
object)) gfs2_dump_glock(NULL, gl, true); - } } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) @@ -959,6 +963,25 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) gfs2_holder_uninit(&gh); } +static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + wait_on_inode(&ip->i_inode); + if (is_bad_inode(&ip->i_inode)) { + iput(&ip->i_inode); + ip = NULL; + } + } + return ip; +} + static void gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; @@ -976,32 +999,15 @@ static void gfs2_try_evict(struct gfs2_glock *gl) * happened below. (Verification is triggered by the call to * gfs2_queue_verify_delete() in gfs2_evict_inode().) */ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip && !igrab(&ip->i_inode)) - ip = NULL; - spin_unlock(&gl->gl_lockref.lock); - if (ip) { - wait_on_inode(&ip->i_inode); - if (is_bad_inode(&ip->i_inode)) { - iput(&ip->i_inode); - ip = NULL; - } - } + ip = gfs2_grab_existing_inode(gl); if (ip) { - set_bit(GIF_DEFER_DELETE, &ip->i_flags); + set_bit(GLF_DEFER_DELETE, &gl->gl_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); + clear_bit(GLF_DEFER_DELETE, &gl->gl_flags); /* If the inode was evicted, gl->gl_object will now be NULL. 
*/ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip) { - clear_bit(GIF_DEFER_DELETE, &ip->i_flags); - if (!igrab(&ip->i_inode)) - ip = NULL; - } - spin_unlock(&gl->gl_lockref.lock); + ip = gfs2_grab_existing_inode(gl); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); @@ -1462,9 +1468,7 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; - if (gh->gh_state == LM_ST_UNLOCKED) - return true; - return false; + return !test_bit(HIF_HOLDER, &gh->gh_iflags); } /** @@ -1483,7 +1487,6 @@ __acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; @@ -1519,21 +1522,11 @@ fail: gfs2_holder_wake(gh); return; } - if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) - continue; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); - if (likely(insert_pt == NULL)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - return; - } - list_add_tail(&gh->gh_list, insert_pt); - spin_unlock(&gl->gl_lockref.lock); - if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_lockref.lock); + list_add_tail(&gh->gh_list, &gl->gl_holders); return; trap_recursive: @@ -1673,11 +1666,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } if (list_is_first(&gh->gh_list, &gl->gl_holders) && - !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + test_bit(GLF_LOCK, &gl->gl_flags) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_CANCELING, &gl->gl_flags)) { + set_bit(GLF_CANCELING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_CANCELING, &gl->gl_flags); + 
clear_bit(GLF_LOCK, &gl->gl_flags); + if (!gfs2_holder_queued(gh)) + goto out; } /* @@ -1923,6 +1924,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { @@ -2323,6 +2325,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'f'; if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) *p++ = 'i'; + if (test_bit(GLF_PENDING_REPLY, gflags)) + *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) @@ -2347,6 +2351,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'e'; if (test_bit(GLF_VERIFY_DELETE, gflags)) *p++ = 'E'; + if (test_bit(GLF_DEFER_DELETE, gflags)) + *p++ = 's'; + if (test_bit(GLF_CANCELING, gflags)) + *p++ = 'C'; *p = 0; return buf; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4e19cce3d906..74abbd4970f8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -330,6 +330,9 @@ enum { GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ GLF_VERIFY_DELETE = 18, /* iopen glocks only */ + GLF_PENDING_REPLY = 19, + GLF_DEFER_DELETE = 20, /* iopen glocks only */ + GLF_CANCELING = 21, }; struct gfs2_glock { @@ -376,7 +379,6 @@ enum { GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, - GIF_DEFER_DELETE = 7, }; struct gfs2_inode { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6fbbaaad1cd0..198a8cbaf5e5 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1248,14 +1248,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, * @dentry: The dentry of the new directory * @mode: The mode of the new directory * - * Returns: errno + * Returns: the dentry, or ERR_PTR(errno) */ -static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, - 
struct dentry *dentry, umode_t mode) +static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); - return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); + + return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0)); } /** diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 314ec2a70167..0fd3b5ec7d8c 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -157,7 +157,9 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) /** * gfs2_end_log_write_bh - end log write of pagecache data with buffers * @sdp: The superblock - * @bvec: The bio_vec + * @folio: The folio + * @offset: The first byte within the folio that completed + * @size: The number of bytes that completed * @error: The i/o status * * This finds the relevant buffers and unlocks them and sets the @@ -166,17 +168,13 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) * that is pinned in the pagecache. 
*/ -static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, - struct bio_vec *bvec, - blk_status_t error) +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct folio *folio, + size_t offset, size_t size, blk_status_t error) { struct buffer_head *bh, *next; - struct page *page = bvec->bv_page; - unsigned size; - bh = page_buffers(page); - size = bvec->bv_len; - while (bh_offset(bh) < bvec->bv_offset) + bh = folio_buffers(folio); + while (bh_offset(bh) < offset) bh = bh->b_this_page; do { if (error) @@ -186,7 +184,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, size -= bh->b_size; brelse(bh); bh = next; - } while(bh && size); + } while (bh && size); } /** @@ -203,7 +201,6 @@ static void gfs2_end_log_write(struct bio *bio) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; - struct page *page; struct bvec_iter_all iter_all; if (bio->bi_status) { @@ -217,9 +214,12 @@ static void gfs2_end_log_write(struct bio *bio) } bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; - if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); + struct page *page = bvec->bv_page; + struct folio *folio = page_folio(page); + + if (folio && folio_buffers(folio)) + gfs2_end_log_write_bh(sdp, folio, bvec->bv_offset, + bvec->bv_len, bio->bi_status); else mempool_free(page, gfs2_page_pool); } @@ -359,8 +359,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, - bh_offset(bh), dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, folio_page(bh->b_folio, 0), + bh->b_size, bh_offset(bh), dblock); } /** @@ -406,17 +406,16 @@ static void gfs2_end_log_read(struct bio *bio) } /** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * gfs2_jhead_folio_search - Look for the journal head in a given page. 
* @jd: The journal descriptor * @head: The journal head to start from - * @page: The page to look in + * @folio: The folio to look in * * Returns: 1 if found, 0 otherwise. */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) +static bool gfs2_jhead_folio_search(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct folio *folio) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host lh; @@ -424,7 +423,8 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, unsigned int offset; bool ret = false; - kaddr = kmap_local_page(page); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + kaddr = kmap_local_folio(folio, 0); for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { if (lh.lh_sequence >= head->lh_sequence) @@ -472,7 +472,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, &folio->page); + *done = gfs2_jhead_folio_search(jd, head, folio); /* filemap_get_folio() and the earlier grab_cache_page() */ folio_put_refs(folio, 2); @@ -512,9 +512,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int shift = PAGE_SHIFT - bsize_shift; unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; - int sz, ret = 0; + int ret = 0; struct bio *bio = NULL; - struct page *page = NULL; + struct folio *folio = NULL; bool done = false; errseq_t since; @@ -527,10 +527,11 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, u64 dblock = je->dblock; for (; block < je->lblock + je->blocks; block++, dblock++) { - if (!page) { - page = grab_cache_page(mapping, block >> shift); - if (!page) { - ret = -ENOMEM; + if (!folio) { + folio = filemap_grab_folio(mapping, + block >> shift); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); 
done = true; goto out; } @@ -541,8 +542,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, sector_t sector = dblock << sdp->sd_fsb2bb_shift; if (bio_end_sector(bio) == sector) { - sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) + if (bio_add_folio(bio, folio, bsize, off)) goto block_added; } if (off) { @@ -562,12 +562,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; add_block_to_new_bio: - sz = bio_add_page(bio, page, bsize, off); - BUG_ON(sz != bsize); + if (!bio_add_folio(bio, folio, bsize, off)) + BUG(); block_added: off += bsize; - if (off == PAGE_SIZE) - page = NULL; + if (off == folio_size(folio)) + folio = NULL; if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; @@ -615,15 +615,13 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, static void gfs2_check_magic(struct buffer_head *bh) { - void *kaddr; __be32 *ptr; clear_buffer_escaped(bh); - kaddr = kmap_local_page(bh->b_page); - ptr = kaddr + bh_offset(bh); + ptr = kmap_local_folio(bh->b_folio, bh_offset(bh)); if (*ptr == cpu_to_be32(GFS2_MAGIC)) set_buffer_escaped(bh); - kunmap_local(kaddr); + kunmap_local(ptr); } static int blocknr_cmp(void *priv, const struct list_head *a, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fea3efcc2f93..198cc7056637 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -198,15 +198,14 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) static void gfs2_meta_read_endio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct buffer_head *bh = page_buffers(page); - unsigned int len = bvec->bv_len; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct 
buffer_head *bh = folio_buffers(folio); + size_t len = fi.length; - while (bh_offset(bh) < bvec->bv_offset) + while (bh_offset(bh) < fi.offset) bh = bh->b_this_page; do { struct buffer_head *next = bh->b_this_page; @@ -232,7 +231,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); while (num > 0) { bh = *bhs; - if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) { BUG_ON(bio->bi_iter.bi_size == 0); break; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 92a3b6ddafdc..44e5658b896c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1329,7 +1329,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) goto should_delete; - if (test_bit(GIF_DEFER_DELETE, &ip->i_flags)) + if (gfs2_holder_initialized(&ip->i_iopen_gh) && + test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags)) return EVICT_SHOULD_DEFER_DELETE; /* Deletes should never happen under memory pressure anymore. 
*/ @@ -1338,12 +1339,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, /* Must not read inode block until block type has been verified */ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); - if (unlikely(ret)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - return EVICT_SHOULD_DEFER_DELETE; - } + if (unlikely(ret)) + return EVICT_SHOULD_SKIP_DELETE; if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) return EVICT_SHOULD_SKIP_DELETE; @@ -1363,15 +1360,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && - test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - enum evict_behavior behavior = - gfs2_upgrade_iopen_glock(inode); - - if (behavior != EVICT_SHOULD_DELETE) { - gfs2_holder_uninit(&ip->i_iopen_gh); - return behavior; - } - } + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + return gfs2_upgrade_iopen_glock(inode); return EVICT_SHOULD_DELETE; } @@ -1509,7 +1499,7 @@ static void gfs2_evict_inode(struct inode *inode) gfs2_glock_put(io_gl); goto out; } - behavior = EVICT_SHOULD_DELETE; + behavior = EVICT_SHOULD_SKIP_DELETE; } if (behavior == EVICT_SHOULD_DELETE) ret = evict_unlinked_inode(inode); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 8eae8d62a413..26036ffc3f33 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -53,12 +53,20 @@ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ {(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ - {(1UL << GLF_BLOCKING), "b" }) + {(1UL << GLF_BLOCKING), "b" }, \ + {(1UL << GLF_UNLOCKED), "x" }, \ + {(1UL << GLF_INSTANTIATE_NEEDED), "n" }, \ + {(1UL << 
GLF_INSTANTIATE_IN_PROG), "N" }, \ + {(1UL << GLF_TRY_TO_EVICT), "e" }, \ + {(1UL << GLF_VERIFY_DELETE), "E" }, \ + {(1UL << GLF_DEFER_DELETE), "s" }, \ + {(1UL << GLF_CANCELING), "C" }) #ifndef NUMPTY #define NUMPTY diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 192213c7359a..f8ae2c666fd6 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -246,12 +246,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - lock_page(bh->b_page); + folio_lock(bh->b_folio); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; - unlock_page(bh->b_page); + folio_unlock(bh->b_folio); lock_buffer(bh); gfs2_log_lock(sdp); } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b75c26045df4..86a6b317b474 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. 
*/ -static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); - return res; + return ERR_PTR(res); } d_instantiate(dentry, inode); mark_inode_dirty(inode); - return 0; + return NULL; } /* diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f5c4b3e31a1c..876bbb80fb4d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); + return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0)); } static int hfsplus_rename(struct mnt_idmap *idmap, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e0741e468956..a2c6b9051c5b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -679,17 +679,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino, return err; } -static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, - struct dentry *dentry, umode_t mode) +static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, + struct dentry *dentry, umode_t mode) { + struct inode *inode; char *file; int err; if ((file = dentry_name(dentry)) == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); err = 
do_mkdir(file, mode); + if (err) { + dentry = ERR_PTR(err); + } else { + inode = hostfs_iget(dentry->d_sb, file); + d_drop(dentry); + dentry = d_splice_alias(inode, dentry); + } __putname(file); - return err; + return dentry; } static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d0edf9ed33b6..e3cdc421dfba 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, int r; struct hpfs_dirent dee; int err; - if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? 
-EINVAL : err); hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); @@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); - return 0; + return NULL; bail3: iput(result); bail2: @@ -123,7 +123,7 @@ bail1: hpfs_free_sectors(dir->i_sb, fno, 1); bail: hpfs_unlock(dir->i_sb); - return err; + return ERR_PTR(err); } static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0fc179a59830..d98caedbb723 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -991,14 +991,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, diff --git a/fs/init.c b/fs/init.c index e9387b6c4f30..eef5124885e3 100644 --- a/fs/init.c +++ b/fs/init.c @@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + if (!error) { + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } done_path_create(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 5587aabdaa5e..99318b157a9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -327,7 +327,17 @@ static void i_callback(struct rcu_head 
*head) free_inode_nonrcu(inode); } -static struct inode *alloc_inode(struct super_block *sb) +/** + * alloc_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; @@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + struct super_block *sb = inode->i_sb; + + spin_lock(&sb->s_inode_list_lock); + list_add(&inode->i_sb_list, &sb->s_inodes); + spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { + struct super_block *sb = inode->i_sb; + if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); + spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); + spin_unlock(&sb->s_inode_list_lock); } } @@ -806,23 +820,16 @@ static void evict(struct inode *inode) /* * Wake up waiters in __wait_on_freeing_inode(). * - * Lockless hash lookup may end up finding the inode before we removed - * it above, but only lock it *after* we are done with the wakeup below. - * In this case the potential waiter cannot safely block. + * It is an invariant that any thread we need to wake up is already + * accounted for before remove_inode_hash() acquires ->i_lock -- both + * sides take the lock and sleep is aborted if the inode is found + * unhashed. Thus either the sleeper wins and goes off CPU, or removal + * wins and the sleeper aborts after testing with the lock. 
* - * The inode being unhashed after the call to remove_inode_hash() is - * used as an indicator whether blocking on it is safe. + * This also means we don't need any fences for the call below. */ - spin_lock(&inode->i_lock); - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -900,46 +907,6 @@ again: } EXPORT_SYMBOL_GPL(evict_inodes); -/** - * invalidate_inodes - attempt to free all inodes on a superblock - * @sb: superblock to operate on - * - * Attempts to free all inodes (including dirty inodes) for a given superblock. - */ -void invalidate_inodes(struct super_block *sb) -{ - struct inode *inode, *next; - LIST_HEAD(dispose); - -again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } - if (atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); - spin_unlock(&inode->i_lock); - list_add(&inode->i_lru, &dispose); - if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); - cond_resched(); - dispose_list(&dispose); - goto again; - } - } - spin_unlock(&sb->s_inode_list_lock); - - dispose_list(&dispose); -} - /* * Isolate the inode from the LRU in preparation for freeing it. * @@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** - * new_inode_pseudo - obtain an inode - * @sb: superblock - * - * Allocates a new inode for given superblock. 
- * Inode wont be chained in superblock s_inodes list - * This means : - * - fs can't be unmount - * - quotas, fsnotify, writeback can't work - */ -struct inode *new_inode_pseudo(struct super_block *sb) -{ - return alloc_inode(sb); -} - -/** * new_inode - obtain an inode * @sb: superblock * @@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - inode = new_inode_pseudo(sb); + inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; @@ -1348,8 +1300,8 @@ again: } if (set && unlikely(set(inode, data))) { - inode = NULL; - goto unlock; + spin_unlock(&inode_hash_lock); + return NULL; } /* @@ -1361,14 +1313,14 @@ again: hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); -unlock: - spin_unlock(&inode_hash_lock); return inode; } @@ -1497,8 +1449,8 @@ again: inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); + inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); + +#ifdef CONFIG_DEBUG_VFS +/* + * Dump an inode. + * + * TODO: add a proper inode dumping routine, this is a stub to get debug off the + * ground. 
+ */ +void dump_inode(struct inode *inode, const char *reason) +{ + pr_warn("%s encountered for inode %px", reason, inode); +} + +EXPORT_SYMBOL(dump_inode); +#endif diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..b9b3e29a73fd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file) } } +void fput_close_sync(struct file *); +void fput_close(struct file *); + /* * super.c */ @@ -187,8 +190,8 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); -long do_ftruncate(struct file *file, loff_t length, int small); -long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_ftruncate(struct file *file, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); @@ -207,7 +210,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -void invalidate_inodes(struct super_block *sb); /* * dcache.c @@ -325,6 +327,7 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted * @path: path to check @@ -338,3 +341,5 @@ static inline bool path_mounted(const struct path *path) return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); +bool file_seek_cur_needs_f_lock(struct file *file); +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map); diff --git a/fs/ioctl.c b/fs/ioctl.c index 
638a36be31c1..c91fd2b46a77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) return error; } -static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; @@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, return ret; } -static long ioctl_file_clone_range(struct file *file, - struct file_clone_range __user *argp) +static int ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..814b7f679486 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> -#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. 
@@ -40,8 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -366,20 +362,24 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx, loff_t offset) +static int iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos + offset; - loff_t length = iomap_length(iter) - offset; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; - loff_t orig_pos = pos; size_t poff, plen; sector_t sector; + int ret; - if (iomap->type == IOMAP_INLINE) - return iomap_read_inline_data(iter, folio); + if (iomap->type == IOMAP_INLINE) { + ret = iomap_read_inline_data(iter, folio); + if (ret) + return ret; + return iomap_iter_advance(iter, &length); + } /* zero post-eof blocks as the page may be mapped */ ifs = ifs_alloc(iter->inode, folio, iter->flags); @@ -438,25 +438,22 @@ done: * we can skip trailing ones as they will be handled in the next * iteration. 
*/ - return pos - orig_pos + plen; + length = pos - iter->pos + plen; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - struct folio *folio = ctx->cur_folio; - size_t offset = offset_in_folio(folio, iter->pos); - loff_t length = min_t(loff_t, folio_size(folio) - offset, - iomap_length(iter)); - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + int ret; + + while (iomap_length(iter)) { + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) @@ -474,7 +471,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -493,15 +490,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(const struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t length = iomap_length(iter); - loff_t done, ret; + int ret; - for (done = 0; done < length; done += ret) { + while (iomap_length(iter)) { if (ctx->cur_folio && - offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + offset_in_folio(ctx->cur_folio, iter->pos) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; @@ -510,12 +506,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + ret = 
iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } /** @@ -547,7 +543,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -603,6 +599,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -907,12 +905,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; @@ -923,7 +919,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos = iter->pos; bytes = iov_iter_count(i); retry: @@ -934,8 +931,8 @@ retry: if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. 
@@ -1006,17 +1003,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 0 : status; } ssize_t @@ -1034,9 +1026,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1270,23 +1264,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; @@ -1304,14 +1297,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return 
status; } int @@ -1331,7 +1324,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1350,19 +1343,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1383,25 +1375,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1424,7 +1417,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = 
iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1443,17 +1436,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); + } else { + status = iomap_iter_advance_full(&iter); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1456,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1469,11 +1464,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1490,14 +1486,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return length; + return iomap_iter_advance(iter, &length); } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; @@ -1509,7 +1507,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; @@ -1538,16 +1536,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. 
*/ -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,116 +1563,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. 
- */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_F_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static 
void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1591,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) { + error = wpc->ops->submit_ioend(wpc, error); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; @@ -1709,9 +1610,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1719,36 +1620,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = 
wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* @@ -1779,14 +1668,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, 
inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) @@ -2062,11 +1960,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 0e47da82b0c2..844261a31156 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,6 +12,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. 
*/ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. - */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. 
+ */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. 
- */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -239,6 +244,47 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { @@ -266,81 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, } /* - * Figure out the bio's operation flags from the dio request, the - * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_THROUGH flag in the dio request. 
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O + * that doesn't require any metadata updates (including after I/O completion + * such as unwritten extent conversion) and the underlying device either + * doesn't have a volatile write cache or supports FUA. + * This allows us to avoid cache flushes on I/O completion. */ -static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) +static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, + struct iomap_dio *dio) { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - - if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; - - opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) - opflags |= REQ_ATOMIC; - - return opflags; + if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) + return false; + if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) + return false; + return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); } -static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; - blk_opf_t bio_opf; + blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; struct bio *bio; bool need_zeroout = false; - bool use_fua = false; int nr_pages, ret = 0; - size_t copied = 0; + u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) - return -EINVAL; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } + if (dio->flags 
& IOMAP_DIO_WRITE) { + bio_opf |= REQ_OP_WRITE; + + if (iomap->flags & IOMAP_F_ATOMIC_BIO) { + /* + * Ensure that the mapping covers the full write + * length, otherwise it won't be submitted as a single + * bio, which is required to use hardware atomics. + */ + if (length != iter->len) + return -EINVAL; + bio_opf |= REQ_ATOMIC; + } - if (iomap->flags & IOMAP_F_SHARED) - dio->flags |= IOMAP_DIO_COW; + if (iomap->type == IOMAP_UNWRITTEN) { + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + } + + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + + if (iomap->flags & IOMAP_F_NEW) { + need_zeroout = true; + } else if (iomap->type == IOMAP_MAPPED) { + if (iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } - if (iomap->flags & IOMAP_F_NEW) { - need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this is a pure - * data IO that doesn't require any metadata updates (including - * after IO completion such as unwritten extent conversion) and - * the underlying device either supports FUA or doesn't have - * a volatile write cache. This allows us to avoid cache flushes - * on IO completion. If we can't use writethrough and need to - * sync, disable in-task completions as dio completion will - * need to call generic_write_sync() which will do a blocking - * fsync / cache flush call. + * We can only do deferred completion for pure overwrites that + * don't require additional I/O at completion time. + * + * This rules out writes that need zeroing or extent conversion, + * extend the file size, or issue metadata I/O or cache flushes + * during completion processing. 
*/ - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_THROUGH) && - (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) - use_fua = true; - else if (dio->flags & IOMAP_DIO_NEED_SYNC) + if (need_zeroout || (pos >= i_size_read(inode)) || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && + !(bio_opf & REQ_FUA))) dio->flags &= ~IOMAP_DIO_CALLER_COMP; + } else { + bio_opf |= REQ_OP_READ; } /* @@ -355,18 +405,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only do deferred completion for pure overwrites that - * don't require additional IO at completion. This rules out - * writes that need zeroing or extent conversion, extend - * the file size, or issue journal IO or cache flushes - * during completion processing. - */ - if (need_zeroout || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; - - /* * The rules for polled IO completions follow the guidelines as the * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. @@ -383,8 +421,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; @@ -416,9 +452,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) { /* - * This bio should have covered the complete length, + * An atomic write bio must cover the complete length, * which it doesn't, so error. We may need to zero out * the tail (complete FS block), similar to when * bio_iov_iter_get_pages() returns an error, above. 
@@ -465,30 +501,28 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return copied; + return iomap_iter_advance(iter, &copied); return ret; } -static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; - return length; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, - struct iomap_dio *dio) +static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - size_t copied; + u64 copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; @@ -510,11 +544,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, dio->size += copied; if (!copied) return -EFAULT; - return copied; + return iomap_iter_advance(iomi, &copied); } -static loff_t iomap_dio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: @@ -608,9 +641,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -645,6 +675,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + /* for data sync or sync, we need sync 
completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -698,7 +731,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) - return iomap_length(iter); + goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; - switch (ret) { - case 0: /* success */ - return iomap_length(iter); - case 1: /* extent array full */ - return 0; - default: /* error */ + if (ret < 0) return ret; - } + if (ret == 1) /* extent array full */ + return 0; + +advance: + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, @@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new 
file mode 100644 index 000000000000..f6992a3bf66a --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..18894ebba6db --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/list_sort.h> +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. 
Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. 
+ */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. 
+ */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,40 +7,25 @@ #include <linux/iomap.h> #include "trace.h" -/* - * Advance to the next range we need to map. 
- * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. - */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } - - /* clear the per iteration state */ - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return ret; +} + +/* + * Advance the current iterator position and output the length remaining for the + * current mapping. 
+ */ +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +{ + if (WARN_ON_ONCE(*count > iomap_length(iter))) + return -EIO; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced; + u64 olen; int ret; - if (iter->iomap.length && ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + /* + * Calculate how far the iter was advanced and the original length bytes + * for ->iomap_end(). 
+ */ + advanced = iter->pos - iter->iter_start_pos; + olen = iter->len + advanced; + + if (ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } - trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); + /* detect old return semantics where this would advance */ + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; + + /* + * Use iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate on error status or if the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. + */ + if (iter->status < 0) + ret = iter->status; + else if (iter->len == 0 || (!advanced && !stale)) + ret = 0; + else + ret = 1; + iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index a845c012b50c..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return length; + return iomap_iter_advance(iter, &length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return length; + return iomap_iter_advance(iter, &length); } } @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops 
*ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); switch (iter->iomap.type) { case IOMAP_HOLE: - return length; + return iomap_iter_advance(iter, &length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return length; + return iomap_iter_advance(iter, &length); return 0; default: *hole_pos = iter->pos; @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index b90d0eda9e51..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
*/ -static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { switch (iomap->type) { @@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_length(iter); + + return iomap_iter_advance_full(iter); } /* @@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..9eab2c8ac3c5 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e8e80761ac73..1c7c49356878 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -57,8 +57,8 @@ static void 
journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under j_list_lock. The caller provided us with a ref against the + * buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -738,10 +738,8 @@ start_journal_io: err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", journal->j_devname); - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) - jbd2_journal_abort(journal, err); + "JBD2: Detected IO errors %d while flushing file data on %s\n", + err, journal->j_devname); err = 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d8084b31b361..a5ccba25ff47 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) { int ret = 0; - transaction_t *commit_trans; + transaction_t *commit_trans, *running_trans; if (!(journal->j_flags & JBD2_BARRIER)) return 0; @@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) goto out; commit_trans = journal->j_committing_transaction; if (!commit_trans || commit_trans->t_tid != tid) { + running_trans = journal->j_running_transaction; + /* + * The query transaction hasn't started committing, + * it must still be running. 
+ */ + if (WARN_ON_ONCE(!running_trans || + running_trans->t_tid != tid)) + goto out; + + running_trans->t_need_data_flush = 1; ret = 1; goto out; } @@ -947,7 +957,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * descriptor blocks we do need to generate bona fide buffers. * * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ @@ -1361,7 +1371,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal) && + if (jbd2_journal_has_csum_v2or3(journal) && jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " @@ -1369,7 +1379,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (jbd2_journal_has_csum_v2or3(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; @@ -1869,7 +1879,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, /* Log is no longer empty */ write_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); @@ -1965,17 +1974,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) return err; } - if (block_start == ~0ULL) { - block_start = phys_block; - block_stop = block_start - 1; - } + if (block_start == ~0ULL) + block_stop = block_start = phys_block; /* * last block not contiguous with current block, * process last contiguous region and return to this block on * next loop */ - 
if (phys_block != block_stop + 1) { + if (phys_block != block_stop) { block--; } else { block_stop++; @@ -1994,11 +2001,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) */ byte_start = block_start * journal->j_blocksize; byte_stop = block_stop * journal->j_blocksize; - byte_count = (block_stop - block_start + 1) * - journal->j_blocksize; + byte_count = (block_stop - block_start) * journal->j_blocksize; truncate_inode_pages_range(journal->j_dev->bd_mapping, - byte_start, byte_stop); + byte_start, byte_stop - 1); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { err = blkdev_issue_discard(journal->j_dev, @@ -2013,7 +2019,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) } if (unlikely(err != 0)) { - pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", err, block_start, block_stop); return err; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9192be7c19d8..c271a050b7e6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -39,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n) */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static void do_readahead(journal_t *journal, unsigned int start) { - int err; unsigned int max, nbufs, next; unsigned long long blocknr; struct buffer_head *bh; @@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start) nbufs = 0; for (next = start; next < max; next++) { - err = jbd2_journal_bmap(journal, next, &blocknr); + int err = jbd2_journal_bmap(journal, next, 
&blocknr); if (err) { printk(KERN_ERR "JBD2: bad block at offset %u\n", @@ -94,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start) } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; + if (!bh) goto failed; - } if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; @@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start) if (nbufs) bh_readahead_batch(nbufs, bufs, 0); - err = 0; failed: if (nbufs) journal_brelse_array(bufs, nbufs); - return err; } #endif /* __KERNEL__ */ @@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. 
*/ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -852,6 +879,13 @@ chksum_ok: case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. 
+ */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -863,12 +897,7 @@ chksum_ok: need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) - continue; - - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); if (err) goto failed; @@ -922,8 +951,9 @@ chksum_ok: /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index ce63d5fde9c3..0cf0fddbee81 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void) return 0; } -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; @@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; 
table->hash_shift = shift; table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; @@ -245,7 +245,7 @@ out: return table; } -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; @@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) J_ASSERT(list_empty(hash_list)); } - kfree(table->hash_table); + kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } @@ -420,12 +420,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ -int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; - int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -450,7 +449,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; } } @@ -473,11 +471,10 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) __brelse(bh2); } } - return did_revoke; } /* - * journal_clear_revoked_flag clears revoked flag of buffers in + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. 
*/ @@ -506,9 +503,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) } } -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz +/* jbd2_journal_switch_revoke_table table select j_revoke for next + * transaction we do not want to suspend any processing until all + * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 66513c18ca29..cbc4785462f5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal, atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); - INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); @@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal, */ /* - * Update transaction's maximum wait time, if debugging is enabled. - * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiplre threads trying to do this simultaneously * hence using cmpxchg to avoid any use of locks in this case. - * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. 
*/ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) @@ -2079,21 +2075,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) jh->b_transaction = NULL; } -void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - spin_lock(&jh->b_state_lock); - spin_lock(&journal->j_list_lock); - __jbd2_journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - spin_unlock(&jh->b_state_lock); - jbd2_journal_put_journal_head(jh); - __brelse(bh); -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -2192,7 +2173,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in - * __journal_file_buffer + * __jbd2_journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 2b2938970da3..dd91f725ded6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); +static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); @@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, } -static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, - struct dentry *dentry, umode_t 
mode) +static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, + struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, ri = jffs2_alloc_raw_inode(); if (!ri) - return -ENOMEM; + return ERR_PTR(-ENOMEM); c = JFFS2_SB_INFO(dir_i->i_sb); @@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (ret) { jffs2_free_raw_inode(ri); - return ret; + return ERR_PTR(ret); } inode = jffs2_new_inode(dir_i, mode, ri); @@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (IS_ERR(inode)) { jffs2_free_raw_inode(ri); jffs2_complete_reservation(c); - return PTR_ERR(inode); + return ERR_CAST(inode); } inode->i_op = &jffs2_dir_inode_operations; @@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, jffs2_complete_reservation(c); d_instantiate_new(dentry, inode); - return 0; + return NULL; fail: iget_failed(inode); - return ret; + return ERR_PTR(ret); } static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 07cfdc440596..60fc92dee24d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -369,7 +369,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) ASSERT(length >= 0); - if (test_cflag(COMMIT_Nolink, ip)) { + if (test_cflag(COMMIT_Nolink, ip) || isReadOnly(ip)) { xtTruncate(0, ip, length, COMMIT_WMAP); return; } diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f9009e4f9ffd..26e89d0c69b6 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -178,41 +178,26 @@ int dbMount(struct inode *ipbmap) dbmp_le = (struct dbmap_disk *) mp->data; bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); - bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); - if (bmp->db_l2nbperpage > L2PSIZE - 
L2MINBLOCKSIZE || - bmp->db_l2nbperpage < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag || bmp->db_numag > MAXAG) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); - if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 || - bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); - if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || - bmp->db_agl2size < 0) { - err = -EINVAL; - goto err_release_metapage; - } - if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) || + (bmp->db_l2nbperpage < 0) || + !bmp->db_numag || (bmp->db_numag > MAXAG) || + (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) || + (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) || + !bmp->db_agwidth || + (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) || + (bmp->db_agl2size < 0) || + ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { err = -EINVAL; goto err_release_metapage; } @@ -3403,7 +3388,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) oldl2agsize = bmp->db_agl2size; bmp->db_agl2size = l2agsize; - bmp->db_agsize = 1 << l2agsize; + bmp->db_agsize = (s64)1 << l2agsize; /* compute new number of AG */ agno = bmp->db_numag; @@ -3666,8 +3651,8 @@ void dbFinalizeBmap(struct inode *ipbmap) * system size is not a multiple of the group size). */ inactfree = (inactags && ag_rem) ? 
- ((inactags - 1) << bmp->db_agl2size) + ag_rem - : inactags << bmp->db_agl2size; + (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem + : ((s64)inactags << bmp->db_agl2size); /* determine how many free blocks are in the active * allocation groups plus the average number of free blocks diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8f85177f284b..93db6eec4465 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -117,7 +117,8 @@ do { \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ - ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ + ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 63d21822d309..46529bcc8297 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -74,6 +74,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) int rc; int xflag; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + /* This blocks if we are low on resources */ txBeginAnon(ip->i_sb); @@ -253,6 +258,11 @@ int extRecord(struct inode *ip, xad_t * xp) { int rc; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + txBeginAnon(ip->i_sb); mutex_lock(&JFS_IP(ip)->commit_mutex); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a360b24ed320..ecb8e05b8b84 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -102,7 +102,7 @@ int diMount(struct inode *ipimap) * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. 
*/ - imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + imap = kzalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) return -ENOMEM; @@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ - if ((copy_from_dinode(dp, ip)) != 0) { + if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) { /* handle bad return by returning NULL for ip */ set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); @@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, * * RETURN VALUES: * 0 - success - * -ENOMEM - insufficient memory + * -EINVAL - unexpected inode type */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int fileset = le32_to_cpu(dip->di_fileset); + + switch (fileset) { + case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I: + case LOG_I: case BADBLOCK_I: case FILESYSTEM_I: + break; + default: + return -EINVAL; + } - jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->fileset = fileset; jfs_ip->mode2 = le32_to_cpu(dip->di_mode); jfs_set_inode_flags(ip); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index fc8ede43afde..65a218eba8fa 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, * dentry - dentry of child directory * mode - create mode (rwxrwxrwx). * - * RETURN: Errors from subroutines + * RETURN: ERR_PTR() of errors from subroutines. 
* * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, - struct dentry *dentry, umode_t mode) +static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, + struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, out1: jfs_info("jfs_mkdir: rc:%d", rc); - return rc; + return ERR_PTR(rc); } /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 223d9ac59839..10368c188c5e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -389,8 +389,8 @@ static int jfs_reconfigure(struct fs_context *fc) if (!ctx->newLVSize) { ctx->newLVSize = sb_bdev_nr_blocks(sb); - if (ctx->newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); } rc = jfs_extendfs(sb, ctx->newLVSize, 0); @@ -766,7 +766,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24afbae87225..11d7f74d207b 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { - int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); - - printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, size, 1); + if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) { + printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n", + EALIST_SIZE(ea_buf->xattr)); + } else { + int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); + + 
printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, size, 1); + } ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5f0f8b95f44c..d296aad70800 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1230,24 +1230,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) - return -EPERM; + return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) - return -ENODEV; + return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); - return ret; + return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/libfs.c b/fs/libfs.c index dc042a975a56..6393d7c49ee6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry **stashed) +struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; @@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. 
*/ - path->dentry = get_stashed_dentry(stashed); + path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 5d9c1406fe27..8938536d8d3c 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode = minix_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode_inc_link_count(dir); minix_set_inode(inode, 0); @@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 7b1df8cc2821..a37991fdb194 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -6,6 +6,7 @@ #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/seq_file.h> #include "internal.h" @@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); + +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) +{ + struct uid_gid_map *map, *map_up; + u32 idx, nr_mappings; + + if (!is_valid_mnt_idmap(idmap)) + return 0; + + /* + * Idmappings are shown relative to the caller's idmapping. + * This is both the most intuitive and most useful solution. 
+ */ + if (uid_map) { + map = &idmap->uid_map; + map_up = &current_user_ns()->uid_map; + } else { + map = &idmap->gid_map; + map_up = &current_user_ns()->gid_map; + } + + for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { + uid_t lower; + struct uid_gid_extent *extent; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = &map->extent[idx]; + else + extent = &map->forward[idx]; + + /* + * Verify that the whole range of the mapping can be + * resolved in the caller's idmapping. If it cannot be + * resolved skip the mapping. + */ + lower = map_id_range_up(map_up, extent->lower_first, extent->count); + if (lower == (uid_t) -1) + continue; + + seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); + + seq->count++; /* mappings are separated by \0 */ + if (seq_has_overflowed(seq)) + return -EAGAIN; + + nr_mappings++; + } + + return nr_mappings; +} diff --git a/fs/mount.h b/fs/mount.h index ffb613cdfeee..7aecf2a60472 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,12 @@ #include <linux/ns_common.h> #include <linux/fs_pin.h> +extern struct list_head notify_list; + +typedef __u32 __bitwise mntns_flags_t; + +#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) + struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -20,12 +26,18 @@ struct mnt_namespace { wait_queue_head_t poll; struct rcu_head mnt_ns_rcu; }; + u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; +#ifdef CONFIG_FSNOTIFY + __u32 n_fsnotify_mask; + struct fsnotify_mark_connector __rcu *n_fsnotify_marks; +#endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ + mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { @@ -76,6 +88,8 @@ struct mount { #ifdef CONFIG_FSNOTIFY struct
fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; + struct list_head to_notify; /* need to queue notification */ + struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ @@ -156,6 +170,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt) return !RB_EMPTY_NODE(&mnt->mnt_node); } +static inline bool mnt_ns_empty(const struct mnt_namespace *ns) +{ + return RB_EMPTY_ROOT(&ns->mounts); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { struct mnt_namespace *ns = mnt->mnt_ns; @@ -177,3 +196,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } + +#ifdef CONFIG_FSNOTIFY +static inline void mnt_notify_add(struct mount *m) +{ + /* Optimize the case where there are no watches */ + if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || + (m->prev_ns && m->prev_ns->n_fsnotify_marks)) + list_add_tail(&m->to_notify, &notify_list); + else + m->prev_ns = m->mnt_ns; +} +#else +static inline void mnt_notify_add(struct mount *m) +{ +} +#endif + +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index 82aecf372743..ad7844de87c3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ - if (inode->i_blkbits == PAGE_SHIFT && + if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; @@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >>
blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; @@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) sector_t last_block_in_file; sector_t first_block; unsigned page_block; - unsigned first_hole = blocks_per_page; + unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; @@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - /* MAX_BUF_PER_PAGE, for example */ - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (folio_buffers(folio)) goto confused; - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + args->nr_pages * blocks_per_page; + block_in_file = folio_pos(folio) >> blkbits; + last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) clear_buffer_mapped(map_bh); break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * Then do more get_blocks calls until we are done with this folio. 
*/ map_bh->b_folio = folio; - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (!buffer_mapped(map_bh)) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; @@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) goto confused; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) bdev = map_bh->b_bdev; } - if (first_hole != blocks_per_page) { - folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); + if (first_hole != blocks_per_folio) { + folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); @@ -303,10 +300,10 @@ alloc_new: relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || - (first_hole != blocks_per_page)) + (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else - args->last_block_in_bio = first_block + blocks_per_page - 1; + args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return args->bio; @@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, - .nr_pages = 1, + .nr_pages = 
folio_nr_pages(folio), .get_block = get_block, }; @@ -456,12 +453,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; - unsigned first_unmapped = blocks_per_page; + unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; @@ -486,12 +483,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, */ if (buffer_dirty(bh)) goto confused; - if (first_unmapped == blocks_per_page) + if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } - if (first_unmapped != blocks_per_page) + if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) @@ -527,7 +524,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); + block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. 
@@ -536,7 +533,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; - for (page_block = 0; page_block < blocks_per_page; ) { + for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; @@ -618,14 +615,14 @@ alloc_new: BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); - if (boundary || (first_unmapped != blocks_per_page)) { + if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { - mpd->last_block_in_bio = first_block + blocks_per_page - 1; + mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; diff --git a/fs/namei.c b/fs/namei.c index ecb7b95c2ca3..360a86ca1f02 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,6 +125,13 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) +static inline void initname(struct filename *name) +{ + name->uptr = NULL; + name->aname = NULL; + atomic_set(&name->refcnt, 1); +} + struct filename * getname_flags(const char __user *filename, int flags) { @@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags) return ERR_PTR(-ENAMETOOLONG); } } - - atomic_set(&result->refcnt, 1); - result->uptr = filename; - result->aname = NULL; + initname(result); audit_getname(result); return result; } @@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags) return getname_flags(filename, flags); } -struct filename *getname(const char __user * filename) -{ - return getname_flags(filename, 0); -} - struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; @@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, 
filename, len); - result->uptr = NULL; - result->aname = NULL; - atomic_set(&result->refcnt, 1); + initname(result); audit_getname(result); - return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { + int refcnt; + if (IS_ERR_OR_NULL(name)) return; - if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) - return; + refcnt = atomic_read(&name->refcnt); + if (refcnt != 1) { + if (WARN_ON_ONCE(!refcnt)) + return; - if (!atomic_dec_and_test(&name->refcnt)) - return; + if (!atomic_dec_and_test(&name->refcnt)) + return; + } if (name->name != name->iname) { __putname(name->name); @@ -1670,6 +1671,8 @@ static struct dentry *lookup_dcache(const struct qstr *name, * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, @@ -1680,7 +1683,7 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct inode *dir = base->d_inode; if (dentry) - return dentry; + goto found; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) @@ -1695,6 +1698,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, dput(dentry); dentry = old; } +found: + if (IS_ERR(dentry)) + return dentry; + if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); @@ -2863,15 +2877,14 @@ static int lookup_one_common(struct mnt_idmap *idmap, * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * - * The caller must hold base->i_mutex. 
+ * No locks need be held - only a counted reference to @base is needed. + * */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -3415,6 +3428,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; + default: + VFS_BUG_ON_INODE(1, inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); @@ -3995,7 +4010,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4078,27 +4093,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) - create_flags = 0; + create_flags &= ~LOOKUP_CREATE; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; - error = -EEXIST; - if (d_is_positive(dentry)) - goto fail; - - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. 
- */ - if (unlikely(!create_flags)) { - error = -ENOENT; - goto fail; - } if (unlikely(err2)) { error = err2; goto fail; @@ -4129,7 +4130,8 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { - dput(dentry); + if (!IS_ERR(dentry)) + dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); @@ -4275,7 +4277,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d } /** - * vfs_mkdir - create directory + * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory @@ -4288,32 +4290,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. + * + * In the event that the filesystem does not use the *@dentry but leaves it + * negative or unhashes it and possibly splices a different one returning it, + * the original dentry is dput() and the alternate is returned. + * + * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
*/ -int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; + struct dentry *de; error = may_create(idmap, dir, dentry); if (error) - return error; + goto err; + error = -EPERM; if (!dir->i_op->mkdir) - return -EPERM; + goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) - return error; + goto err; + error = -EMLINK; if (max_links && dir->i_nlink >= max_links) - return -EMLINK; + goto err; - error = dir->i_op->mkdir(idmap, dir, dentry, mode); - if (!error) - fsnotify_mkdir(dir, dentry); - return error; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); + error = PTR_ERR(de); + if (IS_ERR(de)) + goto err; + if (de) { + dput(dentry); + dentry = de; + } + fsnotify_mkdir(dir, dentry); + return dentry; + +err: + dput(dentry); + return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -4333,8 +4354,10 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4445,10 +4468,6 @@ retry: error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; - if (!dentry->d_inode) { - error = -ENOENT; - goto exit4; - } error = security_path_rmdir(&path, dentry); if (error) goto exit4; @@ -4579,7 +4598,7 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ - if (last.name[last.len] || d_is_negative(dentry)) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; ihold(inode); @@ -4613,9 +4632,7 @@ exit1: return error; slashes: - if (d_is_negative(dentry)) - error = -ENOENT; - else if (d_is_dir(dentry)) + if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; @@ -5115,7 +5132,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; + unsigned int lookup_flags = 0, target_flags = + LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; int error = -EINVAL; @@ -5128,6 +5146,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, if (flags & RENAME_EXCHANGE) target_flags = 0; + if (flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, @@ -5169,23 +5189,12 @@ retry_deleg: error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; - /* source must exist */ - error = -ENOENT; - if (d_is_negative(old_dentry)) - goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; - error = -EEXIST; - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) - goto exit5; if (flags & RENAME_EXCHANGE) { - error = -ENOENT; - if (d_is_negative(new_dentry)) - goto exit5; - if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) diff --git a/fs/namespace.c b/fs/namespace.c index 8f1000f9f3df..6100e5b962a6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -81,15 +81,23 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); +#ifdef CONFIG_FSNOTIFY 
+LIST_HEAD(notify_list); /* protected by namespace_sem */ +#endif static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ +enum mount_kattr_flags_t { + MOUNT_KATTR_RECURSE = (1 << 0), + MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), +}; + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; - bool recurse; + enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; @@ -163,6 +171,7 @@ static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { + fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -998,6 +1007,17 @@ static inline int check_mnt(struct mount *mnt) return mnt->mnt_ns == current->nsproxy->mnt_ns; } +static inline bool check_anonymous_mnt(struct mount *mnt) +{ + u64 seq; + + if (!is_anon_ns(mnt->mnt_ns)) + return false; + + seq = mnt->mnt_ns->seq_origin; + return !seq || (seq == current->nsproxy->mnt_ns->seq); +} + /* * vfsmount lock must be held for write */ @@ -1176,6 +1196,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + + mnt_notify_add(mnt); } /* @@ -1723,6 +1745,50 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); +#ifdef CONFIG_FSNOTIFY +static void mnt_notify(struct mount *p) +{ + if (!p->prev_ns && p->mnt_ns) { + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } else if (p->prev_ns && !p->mnt_ns) { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + } else if (p->prev_ns == p->mnt_ns) { + fsnotify_mnt_move(p->mnt_ns, &p->mnt); + } else { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } + p->prev_ns = p->mnt_ns; +} + +static void notify_mnt_list(void) +{ + struct 
mount *m, *tmp; + /* + * Notify about mounts that were added/reparented/detached/remain + * connected after unmount. + */ + list_for_each_entry_safe(m, tmp, &notify_list, to_notify) { + mnt_notify(m); + list_del_init(&m->to_notify); + } +} + +static bool need_notify_mnt_list(void) +{ + return !list_empty(&notify_list); +} +#else +static void notify_mnt_list(void) +{ +} + +static bool need_notify_mnt_list(void) +{ + return false; +} +#endif + static void namespace_unlock(void) { struct hlist_head head; @@ -1733,7 +1799,18 @@ static void namespace_unlock(void) hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); - up_write(&namespace_sem); + if (need_notify_mnt_list()) { + /* + * No point blocking out concurrent readers while notifications + * are sent. This will also allow statmount()/listmount() to run + * concurrently. + */ + downgrade_write(&namespace_sem); + notify_mnt_list(); + up_read(&namespace_sem); + } else { + up_write(&namespace_sem); + } shrink_dentry_list(&list); @@ -1846,6 +1923,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); + + /* + * At this point p->mnt_ns is NULL, notification will be queued + * only if + * + * - p->prev_ns is non-NULL *and* + * - p->prev_ns->n_fsnotify_marks is non-NULL + * + * This will preclude queuing the mount if this is a cleanup + * after a failed copy_tree() or destruction of an anonymous + * namespace, etc.
+ */ + mnt_notify_add(p); } } @@ -2026,6 +2116,7 @@ static void warn_mandlock(void) static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); + struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; @@ -2035,7 +2126,7 @@ static int can_umount(const struct path *path, int flags) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; - if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -2145,16 +2236,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr } } +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) +{ + if (!is_mnt_ns_file(dentry)) + return NULL; + + return to_mnt_ns(get_proc_ns(dentry->d_inode)); +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ - struct mnt_namespace *mnt_ns; - if (!is_mnt_ns_file(dentry)) + struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); + + if (!mnt_ns) return false; - mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -2246,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path) static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +{ + /* + * This mount belonged to an anonymous mount namespace + * but was moved to a non-anonymous mount namespace and + * then unmounted. + */ + if (unlikely(!mnt_ns)) + return false; + + /* + * This mount belongs to a non-anonymous mount namespace + * and we know that such a mount can never transition to + * an anonymous mount namespace again. 
+ */ + if (!is_anon_ns(mnt_ns)) { + /* + * A detached mount either belongs to an anonymous mount + * namespace or a non-anonymous mount namespace. It + * should never belong to something purely internal. + */ + VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); + return false; + } + + return true; +} + void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; - namespace_lock(); - lock_mount_hash(); - ns = real_mount(mnt)->mnt_ns; - if (ns) { - if (is_anon_ns(ns)) - umount_tree(real_mount(mnt), UMOUNT_CONNECTED); - else - ns = NULL; + struct mount *m = real_mount(mnt); + + scoped_guard(rcu) { + if (!must_dissolve(READ_ONCE(m->mnt_ns))) + return; } - unlock_mount_hash(); - namespace_unlock(); - if (ns) - free_mnt_ns(ns); + + scoped_guard(rwsem_write, &namespace_sem) { + ns = m->mnt_ns; + if (!must_dissolve(ns)) + return; + + /* + * After must_dissolve() we know that this is a detached + * mount in an anonymous mount namespace. + * + * Now when mnt_has_parent() reports that this mount + * tree has a parent, we know that this anonymous mount + * tree has been moved to another anonymous mount + * namespace. + * + * So when closing this file we cannot unmount the mount + * tree. This will be done when the file referring to + * the root of the anonymous mount namespace will be + * closed (It could already be closed but it would sync + * on @namespace_sem and wait for us to finish.). + */ + if (mnt_has_parent(m)) + return; + + lock_mount_hash(); + umount_tree(m, UMOUNT_CONNECTED); + unlock_mount_hash(); + } + + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) @@ -2287,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. 
Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) +{ + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + /** * clone_private_mount - create a private clone of a path * @path: path to clone @@ -2295,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * + * This assumes caller has called or done the equivalent of may_mount(). + * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) @@ -2302,30 +2478,36 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - down_read(&namespace_sem); + scoped_guard(rwsem_read, &namespace_sem) if (IS_MNT_UNBINDABLE(old_mnt)) - goto invalid; + return ERR_PTR(-EINVAL); - if (!check_mnt(old_mnt)) - goto invalid; + if (mnt_has_parent(old_mnt)) { + if (!check_mnt(old_mnt)) + return ERR_PTR(-EINVAL); + } else { + if (!is_mounted(&old_mnt->mnt)) + return ERR_PTR(-EINVAL); + + /* Make sure this isn't something purely kernel internal. */ + if (!is_anon_ns(old_mnt->mnt_ns)) + return ERR_PTR(-EINVAL); + + /* Make sure we don't create mount namespace loops. 
*/ + if (!check_for_nsfs_mounts(old_mnt)) + return ERR_PTR(-EINVAL); + } if (has_locked_children(old_mnt, path->dentry)) - goto invalid; + return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); - if (IS_ERR(new_mnt)) - return ERR_CAST(new_mnt); + return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; - return &new_mnt->mnt; - -invalid: - up_read(&namespace_sem); - return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -2424,6 +2606,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) enum mnt_tree_flags_t { MNT_TREE_MOVE = BIT(0), MNT_TREE_BENEATH = BIT(1), + MNT_TREE_PROPAGATION = BIT(2), }; /** @@ -2547,6 +2730,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, dest_mp = smp; unhash_mnt(source_mnt); attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + mnt_notify_add(source_mnt); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { @@ -2773,6 +2957,71 @@ static int do_change_type(struct path *path, int ms_flags) return err; } +/* may_copy_tree() - check if a mount tree can be copied + * @path: path to the mount tree to be copied + * + * This helper checks if the caller may copy the mount tree starting + * from @path->mnt. The caller may copy the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller tries to copy an nsfs mount referring to a mount + * namespace, i.e., the caller is trying to copy a mount namespace + * entry from nsfs. + * (3) The caller tries to copy a pidfs mount referring to a pidfd. + * (4) The caller is trying to copy a mount tree that belongs to an + * anonymous mount namespace.
+ * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * This is not strictly necessary. The current semantics of the new + * mount api enforce that the caller must be located in the same + * mount namespace as the mount tree it interacts with. Using the + * origin sequence number preserves these semantics even for + * anonymous mount namespaces. However, one could envision extending + * the api to directly operate across mount namespace if needed. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to copy the mount tree. + * + * Returns true if the mount tree can be copied, false otherwise. 
+ */ +static inline bool may_copy_tree(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + const struct dentry_operations *d_op; + + if (check_mnt(mnt)) + return true; + + d_op = path->dentry->d_op; + if (d_op == &ns_dentry_operations) + return true; + + if (d_op == &pidfs_dentry_operations) + return true; + + if (!is_mounted(path->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + + static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2780,13 +3029,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old)) { - const struct dentry_operations *d_op = old_path->dentry->d_op; - - if (d_op != &ns_dentry_operations && - d_op != &pidfs_dentry_operations) - return mnt; - } + if (!may_copy_tree(old_path)) + return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; @@ -2853,15 +3097,30 @@ out: static struct file *open_detached_copy(struct path *path, bool recursive) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; + struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; struct file *file; + ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); + + /* + * Record the sequence number of the source mount namespace. + * This needs to hold namespace_sem to ensure that the mount + * doesn't get attached. 
+ */ + if (is_mounted(path->mnt)) { + src_mnt_ns = real_mount(path->mnt)->mnt_ns; + if (is_anon_ns(src_mnt_ns)) + ns->seq_origin = src_mnt_ns->seq_origin; + else + ns->seq_origin = src_mnt_ns->seq; + } + mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); @@ -2889,24 +3148,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } -SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { - struct file *file; - struct path path; + int ret; + struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; - int error; - int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) - return -EINVAL; + return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) - return -EINVAL; + return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; @@ -2916,27 +3173,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) - return -EPERM; + return ERR_PTR(-EPERM); + + ret = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (detached) + return open_detached_copy(&path, flags & AT_RECURSIVE); + + return dentry_open(&path, O_PATH, current_cred()); +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +{ + int fd; + struct file *file __free(fput) = NULL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; - error = user_path_at(dfd, filename, lookup_flags, &path); - if 
(unlikely(error)) { - file = ERR_PTR(error); - } else { - if (detached) - file = open_detached_copy(&path, flags & AT_RECURSIVE); - else - file = dentry_open(&path, O_PATH, current_cred()); - path_put(&path); - } - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - fd_install(fd, file); + fd_install(fd, no_free_ptr(file)); return fd; } @@ -3123,28 +3385,6 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -/* - * Check that there aren't references to earlier/same mount namespaces in the - * specified subtree. Such references can act as pins for mount namespaces - * that aren't checked by the mount-cycle checking code, thereby allowing - * cycles to be made. - */ -static bool check_for_nsfs_mounts(struct mount *subtree) -{ - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) - if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; -} - static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; @@ -3320,8 +3560,56 @@ static int can_move_mount_beneath(const struct path *from, return 0; } -static int do_move_mount(struct path *old_path, struct path *new_path, - bool beneath) +/* may_use_mount() - check if a mount tree can be used + * @mnt: vfsmount to be used + * + * This helper checks if the caller may use the mount tree starting + * from @path->mnt. The caller may use the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller is trying to use a mount tree that belongs to an + * anonymous mount namespace. 
+ * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to use the mount tree. + * + * Returns true if the mount tree can be used, false otherwise. + */ +static inline bool may_use_mount(struct mount *mnt) +{ + if (check_mnt(mnt)) + return true; + + /* + * Make sure that noone unmounted the target path or somehow + * managed to get their hands on something purely kernel + * internal. + */ + if (!is_mounted(&mnt->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + +static int do_move_mount(struct path *old_path, + struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; @@ -3329,8 +3617,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, struct mount *parent; struct mountpoint *mp, *old_mp; int err; - bool attached; - enum mnt_tree_flags_t flags = 0; + bool attached, beneath = flags & MNT_TREE_BENEATH; mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) @@ -3346,8 +3633,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, ns = old->mnt_ns; err = -EINVAL; - /* The mountpoint must be in our namespace. */ - if (!check_mnt(p)) + if (!may_use_mount(p)) goto out; /* The thing moved must be mounted... */ @@ -3358,6 +3644,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (!(attached ? 
check_mnt(old) : is_anon_ns(ns))) goto out; + if (is_anon_ns(ns)) { + /* + * Ending up with two files referring to the root of the + * same anonymous mount namespace would cause an error + * as this would mean trying to move the same mount + * twice into the mount tree which would be rejected + * later. But be explicit about it right here. + */ + if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) + goto out; + + /* + * If this is an anonymous mount tree ensure that mount + * propagation can detect mounts that were just + * propagated to the target mount tree so we don't + * propagate onto them. + */ + ns->mntns_flags |= MNTNS_PROPAGATING; + } else if (is_anon_ns(p->mnt_ns)) { + /* + * Don't allow moving an attached mount tree to an + * anonymous mount tree. + */ + goto out; + } + if (old->mnt.mnt_flags & MNT_LOCKED) goto out; @@ -3400,6 +3712,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (err) goto out; + if (is_anon_ns(ns)) + ns->mntns_flags &= ~MNTNS_PROPAGATING; + /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); @@ -3408,10 +3723,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path, out: unlock_mount(mp); if (!err) { - if (attached) + if (attached) { mntput_no_expire(parent); - else + } else { + /* Make sure we notice when we leak mounts. 
*/ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); + } } return err; } @@ -3428,7 +3746,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, false); + err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } @@ -4269,6 +4587,21 @@ err_unlock: return ret; } +static inline int vfs_move_mount(struct path *from_path, struct path *to_path, + enum mnt_tree_flags_t mflags) +{ + int ret; + + ret = security_move_mount(from_path, to_path); + if (ret) + return ret; + + if (mflags & MNT_TREE_PROPAGATION) + return do_set_group(from_path, to_path); + + return do_move_mount(from_path, to_path, mflags); +} + /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination @@ -4282,8 +4615,12 @@ SYSCALL_DEFINE5(move_mount, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { - struct path from_path, to_path; - unsigned int lflags; + struct path to_path __free(path_put) = {}; + struct path from_path __free(path_put) = {}; + struct filename *to_name __free(putname) = NULL; + struct filename *from_name __free(putname) = NULL; + unsigned int lflags, uflags; + enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) @@ -4296,43 +4633,53 @@ SYSCALL_DEFINE5(move_mount, (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; - /* If someone gives a pathname, they aren't permitted to move - * from an fd that requires unmount as we can't get at the flag - * to clear it afterwards. 
- */ + if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; + if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; + lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; - - ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); - if (ret < 0) - return ret; + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + uflags = 0; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; + to_name = getname_maybe_null(to_pathname, uflags); + if (IS_ERR(to_name)) + return PTR_ERR(to_name); + + if (!to_name && to_dfd >= 0) { + CLASS(fd_raw, f_to)(to_dfd); + if (fd_empty(f_to)) + return -EBADF; + + to_path = fd_file(f_to)->f_path; + path_get(&to_path); + } else { + ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); + if (ret) + return ret; + } - ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); - if (ret < 0) - goto out_from; + if (!from_name && from_dfd >= 0) { + CLASS(fd_raw, f_from)(from_dfd); + if (fd_empty(f_from)) + return -EBADF; - ret = security_move_mount(&from_path, &to_path); - if (ret < 0) - goto out_to; + return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); + } - if (flags & MOVE_MOUNT_SET_GROUP) - ret = do_set_group(&from_path, &to_path); - else - ret = do_move_mount(&from_path, &to_path, - (flags & MOVE_MOUNT_BENEATH)); + ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); + if (ret) + return ret; -out_to: - path_put(&to_path); -out_from: - path_put(&from_path); - return ret; + return 
vfs_move_mount(&from_path, &to_path, mflags); } /* @@ -4468,6 +4815,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); + mnt_notify_add(root_mnt); + mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: @@ -4512,11 +4861,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return -EINVAL; /* - * Once a mount has been idmapped we don't allow it to change its - * mapping. It makes things simpler and callers can just create - * another bind-mount they can idmap if they want to. + * We only allow an mount to change it's idmapping if it has + * never been accessible to userspace. */ - if (is_idmapped_mnt(m)) + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ @@ -4576,7 +4924,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) break; } - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } @@ -4606,18 +4954,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { + struct mnt_idmap *old_idmap; + if (!kattr->mnt_idmap) return; - /* - * Pairs with smp_load_acquire() in mnt_idmap(). - * - * Since we only allow a mount to change the idmapping once and - * verified this in can_idmap_mount() we know that the mount has - * @nop_mnt_idmap attached to it. So there's no need to drop any - * references. - */ + old_idmap = mnt_idmap(&mnt->mnt); + + /* Pairs with smp_load_acquire() in mnt_idmap(). 
*/ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); + mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4637,7 +4983,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); @@ -4667,7 +5013,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) */ namespace_lock(); if (kattr->propagation == MS_SHARED) { - err = invent_group_ids(mnt, kattr->recurse); + err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; @@ -4718,7 +5064,7 @@ out: } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; @@ -4726,13 +5072,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; - /* - * We currently do not support clearing an idmapped mount. If this ever - * is a use-case we can revisit this but for now let's keep it simple - * and not allow it. - */ - if (attr->attr_clr & MOUNT_ATTR_IDMAP) - return -EINVAL; + if (attr->attr_clr & MOUNT_ATTR_IDMAP) { + /* + * We can only remove an idmapping if it's never been + * exposed to userspace. + */ + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) + return -EINVAL; + + /* + * Removal of idmappings is equivalent to setting + * nop_mnt_idmap. 
+ */ + if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { + kattr->mnt_idmap = &nop_mnt_idmap; + return 0; + } + } if (attr->userns_fd > INT_MAX) return -EINVAL; @@ -4769,22 +5125,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { - unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - - *kattr = (struct mount_kattr) { - .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), - }; - if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) @@ -4832,35 +5174,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, return -EINVAL; } - return build_mount_idmapped(attr, usize, kattr, flags); + return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { - put_user_ns(kattr->mnt_userns); - kattr->mnt_userns = NULL; + if (kattr->mnt_userns) { + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; + } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } -SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, - unsigned int, flags, struct mount_attr __user *, uattr, - size_t, usize) +static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { - int err; - struct path target; + int ret; struct mount_attr attr; - struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); - if (flags & ~(AT_EMPTY_PATH | - AT_RECURSIVE | - AT_SYMLINK_NOFOLLOW | - AT_NO_AUTOMOUNT)) - return -EINVAL; - if (unlikely(usize > PAGE_SIZE)) 
return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) @@ -4869,9 +5204,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (!may_mount()) return -EPERM; - err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); - if (err) - return err; + ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (ret) + return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && @@ -4879,7 +5214,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, attr.propagation == 0) return 0; - err = build_mount_kattr(&attr, usize, &kattr, flags); + return build_mount_kattr(&attr, usize, kattr); +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_kattr kattr; + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + }; + + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + err = copy_mount_setattr(uattr, usize, &kattr); if (err) return err; @@ -4892,6 +5259,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, + unsigned, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + struct file __free(fput) *file = NULL; + int fd; + + if (!uattr && usize) + return -EINVAL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (uattr) { + int ret; + struct mount_kattr kattr = {}; + + 
kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + ret = copy_mount_setattr(uattr, usize, &kattr); + if (ret) + return ret; + + ret = do_mount_setattr(&file->f_path, &kattr); + if (ret) + return ret; + + finish_mount_kattr(&kattr); + } + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + fd_install(fd, no_free_ptr(file)); + return fd; +} + int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) @@ -4915,6 +5323,7 @@ struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; + struct mnt_idmap *idmap; u64 mask; struct path root; struct statmount sm; @@ -5184,6 +5593,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, true); + if (ret < 0) + return ret; + + s->sm.mnt_uidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_UIDMAP; + return 0; +} + +static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, false); + if (ret < 0) + return ret; + + s->sm.mnt_gidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. 
+ */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_GIDMAP; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5231,6 +5680,14 @@ static int statmount_string(struct kstatmount *s, u64 flag) offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; + case STATMOUNT_MNT_UIDMAP: + sm->mnt_uidmap = start; + ret = statmount_mnt_uidmap(s, seq); + break; + case STATMOUNT_MNT_GIDMAP: + sm->mnt_gidmap = start; + ret = statmount_mnt_gidmap(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5306,7 +5763,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ - if (RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; @@ -5323,6 +5780,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) return 0; } +/* This must be updated whenever a new flag is added */ +#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ + STATMOUNT_MNT_BASIC | \ + STATMOUNT_PROPAGATE_FROM | \ + STATMOUNT_MNT_ROOT | \ + STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | \ + STATMOUNT_MNT_NS_ID | \ + STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | \ + STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | \ + STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_SUPPORTED_MASK) + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5331,7 +5803,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, int err; /* Has the namespace already been emptied? 
*/ - if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); @@ -5356,6 +5828,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); @@ -5389,12 +5862,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_UIDMAP) + err = statmount_string(s, STATMOUNT_MNT_UIDMAP); + + if (!err && s->mask & STATMOUNT_MNT_GIDMAP) + err = statmount_string(s, STATMOUNT_MNT_GIDMAP); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); + if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { + s->sm.mask |= STATMOUNT_SUPPORTED_MASK; + s->sm.supported_mask = STATMOUNT_SUPPORTED; + } + if (err) return err; + /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? 
*/ + WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); + return 0; } @@ -5412,7 +5899,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2b04038b0e40..bc957487f6ec 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1532,7 +1532,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - return flags & LOOKUP_EXCL; + return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) == + (LOOKUP_CREATE | LOOKUP_EXCL); } /* @@ -2421,11 +2422,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. 
*/ -int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; - int error; + struct dentry *ret; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -2434,14 +2435,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); - error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); - trace_nfs_mkdir_exit(dir, dentry, error); - if (error != 0) - goto out_err; - return 0; -out_err: - d_drop(dentry); - return error; + ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret)); + return ret; } EXPORT_SYMBOL_GPL(nfs_mkdir); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fae2c7ae4acc..1ac1d3eec517 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -400,8 +400,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 0c3bc98cd999..755ed3c37051 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -578,13 +578,13 @@ out: return status; } -static int +static struct dentry * nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - struct dentry *d_alias; - int status = -ENOMEM; + struct dentry *ret = 
ERR_PTR(-ENOMEM); + int status; dprintk("NFS call mkdir %pd\n", dentry); @@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (data == NULL) goto out; - status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); - if (status) + ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode, + &default_acl, &acl)); + if (IS_ERR(ret)) goto out; data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; @@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - d_alias = nfs3_do_create(dir, dentry, data); - status = PTR_ERR_OR_ZERO(d_alias); + ret = nfs3_do_create(dir, dentry, data); - if (status != 0) + if (IS_ERR(ret)) goto out_release_acls; - if (d_alias) - dentry = d_alias; + if (ret) + dentry = ret; status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + if (status) { + dput(ret); + ret = ERR_PTR(status); + } - dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); out: nfs3_free_createdata(data); - dprintk("NFS reply mkdir: %d\n", status); - return status; + dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret)); + return ret; } static int diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6e95db6c17e9..70c8ea943019 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3154,9 +3154,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (d_really_is_negative(dentry)) { struct dentry *alias; d_drop(dentry); - alias = d_exact_alias(dentry, state->inode); - if (!alias) - alias = d_splice_alias(igrab(state->inode), dentry); + alias = d_splice_alias(igrab(state->inode), dentry); /* d_splice_alias() can't fail here - it's a non-directory */ if (alias) { dput(ctx->dentry); @@ -5139,9 +5137,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { 
spin_lock(&dir->i_lock); - /* Creating a directory bumps nlink in the parent */ - if (data->arg.ftype == NF4DIR) - nfs4_inc_nlink_locked(dir); nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, data->res.fattr->time_start, NFS_INO_INVALID_DATA); @@ -5151,6 +5146,25 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ return status; } +static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, + struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + + if (status) + return ERR_PTR(status); + + spin_lock(&dir->i_lock); + /* Creating a directory bumps nlink in the parent */ + nfs4_inc_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); +} + static void nfs4_free_createdata(struct nfs4_createdata *data) { nfs4_label_free(data->fattr.label); @@ -5207,32 +5221,34 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, struct nfs4_label *label) +static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, + struct nfs4_label *label) { struct nfs4_createdata *data; - int status = -ENOMEM; + struct dentry *ret = ERR_PTR(-ENOMEM); data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - status = nfs4_do_create(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data); nfs4_free_createdata(data); out: - return status; + return ret; } -static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) +static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + 
struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { .interruptible = true, }; struct nfs4_label l, *label; + struct dentry *alias; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5240,14 +5256,15 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); + err = PTR_ERR_OR_ZERO(alias); trace_nfs4_mkdir(dir, &dentry->d_name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); nfs4_label_release_security(label); - return err; + return alias; } static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77920a2e3cef..63e71310b9f6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -446,13 +446,14 @@ out: return status; } -static int +static struct dentry * nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], }; + struct dentry *alias = NULL; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + if (status == 0) { + alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + status = PTR_ERR_OR_ZERO(alias); + } else + alias = ERR_PTR(status); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); - return status; + return alias; } static int diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 28f4d5311c40..c1d9bd07285f 100644 --- a/fs/nfsd/nfs4recover.c 
+++ b/fs/nfsd/nfs4recover.c @@ -233,9 +233,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + if (IS_ERR(dentry)) + status = PTR_ERR(dentry); out_put: - dput(dentry); + if (!status) + dput(dentry); out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 29cb7b812d71..34d7aa531662 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1461,7 +1461,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct inode *dirp; struct iattr *iap = attrs->na_iattr; __be32 err; - int host_err; + int host_err = 0; dentry = fhp->fh_dentry; dirp = d_inode(dentry); @@ -1488,28 +1488,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); - if (!host_err && unlikely(d_unhashed(dchild))) { - struct dentry *d; - d = lookup_one_len(dchild->d_name.name, - dchild->d_parent, - dchild->d_name.len); - if (IS_ERR(d)) { - host_err = PTR_ERR(d); - break; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = nfserr_serverfault; - goto out; - } + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + if (IS_ERR(dchild)) { + host_err = PTR_ERR(dchild); + } else if (d_is_negative(dchild)) { + err = nfserr_serverfault; + goto out; + } else if (unlikely(dchild != resfhp->fh_dentry)) { dput(resfhp->fh_dentry); - resfhp->fh_dentry = dget(d); - err = fh_update(resfhp); - dput(dchild); - dchild = d; - if (err) - goto out; + resfhp->fh_dentry = dget(dchild); } break; case S_IFCHR: @@ -1530,7 +1517,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - dput(dchild); + if (!IS_ERR(dchild)) + dput(dchild); return err; 
out_nfserr: diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 953fbd5f0851..40f4b1a28705 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = nilfs_transaction_begin(dir->i_sb, &ti, 1); if (err) - return err; + return ERR_PTR(err); inc_nlink(dir); @@ -258,7 +258,7 @@ out: else nilfs_transaction_abort(dir->i_sb); - return err; + return ERR_PTR(err); out_fail: drop_nlink(inode); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 95646f7c46ca..6d386080faf2 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FS_ERROR: return fanotify_error_event_equal(FANOTIFY_EE(old), FANOTIFY_EE(new)); + case FANOTIFY_EVENT_TYPE_MNT: + return false; default: WARN_ON_ONCE(1); } @@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - if (!fid_mode) { + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (data_type != FSNOTIFY_EVENT_MNT) + return 0; + } else if (!fid_mode) { /* Do we have path to open a file descriptor? 
*/ if (!path) return 0; @@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } +static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp) +{ + struct fanotify_mnt_event *pevent; + + pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp); + if (!pevent) + return NULL; + + pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT; + pevent->mnt_id = mnt_id; + + return &pevent->fae; +} + static struct fanotify_event *fanotify_alloc_perm_event(const void *data, int data_type, gfp_t gfp) @@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event( fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + u64 mnt_id = fsnotify_data_mnt_id(data, data_type); struct mem_cgroup *old_memcg; struct dentry *moved = NULL; struct inode *child = NULL; @@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event( moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); - } else { + } else if (path) { event = fanotify_alloc_path_event(path, &hash, gfp); + } else if (mnt_id) { + event = fanotify_alloc_mnt_event(mnt_id, gfp); + } else { + WARN_ON_ONCE(1); } if (!event) @@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_RENAME != FS_RENAME); BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); @@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group, mempool_free(fee, &group->fanotify_data.error_events_pool); } +static void fanotify_free_mnt_event(struct fanotify_event *event) +{ + kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event)); +} + static void 
fanotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { @@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group, case FANOTIFY_EVENT_TYPE_FS_ERROR: fanotify_free_error_event(group, event); break; + case FANOTIFY_EVENT_TYPE_MNT: + fanotify_free_mnt_event(event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c12cbc270539..b44e70e44be6 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +extern struct kmem_cache *fanotify_mnt_event_cachep; /* Possible states of the permission event */ enum { @@ -244,6 +245,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ + FANOTIFY_EVENT_TYPE_MNT, __FANOTIFY_EVENT_TYPE_NUM }; @@ -409,12 +411,23 @@ struct fanotify_path_event { struct path path; }; +struct fanotify_mnt_event { + struct fanotify_event fae; + u64 mnt_id; +}; + static inline struct fanotify_path_event * FANOTIFY_PE(struct fanotify_event *event) { return container_of(event, struct fanotify_path_event, fae); } +static inline struct fanotify_mnt_event * +FANOTIFY_ME(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_mnt_event, fae); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. 
When the @@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } +static inline bool fanotify_is_mnt_event(u32 mask) +{ + return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH); +} + static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ba3e2d09eb44..f2d840ae4ded 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init; struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; struct kmem_cache *fanotify_path_event_cachep __ro_after_init; struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; +struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ @@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; (sizeof(struct fanotify_event_info_error)) #define FANOTIFY_RANGE_INFO_LEN \ (sizeof(struct fanotify_event_info_range)) +#define FANOTIFY_MNT_INFO_LEN \ + (sizeof(struct fanotify_event_info_mnt)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode, fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (fanotify_is_mnt_event(event->mask)) + event_len += FANOTIFY_MNT_INFO_LEN; if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; @@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_mnt_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_mnt info = { }; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; + info.hdr.len = FANOTIFY_MNT_INFO_LEN; + + if (WARN_ON(count < 
info.hdr.len)) + return -EFAULT; + + info.mnt_id = FANOTIFY_ME(event)->mnt_id; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { @@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_mnt_event(event->mask)) { + ret = copy_mnt_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; + /* Don't allow mixing mnt events with inode events for now */ + if (flags & FAN_REPORT_MNT) { + if (class != FAN_CLASS_NOTIF) + return -EINVAL; + if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) + return -EINVAL; + } + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; - struct vfsmount *mnt = NULL; struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; @@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; - void *obj; + void *obj = NULL; u32 umask = 0; int ret; @@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; + case FAN_MARK_MNTNS: + obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; + break; default: return -EINVAL; } @@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, 
__u64 mask, return -EINVAL; group = fd_file(f)->private_data; + /* Only report mount events on mnt namespace */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (mask & ~FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type != FAN_MARK_MNTNS) + return -EINVAL; + } else { + if (mask & FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type == FAN_MARK_MNTNS) + return -EINVAL; + } + /* * An unprivileged user is not allowed to setup mount nor filesystem * marks. This also includes setting up such marks by a group that @@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL; @@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) { + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; - } else { - mnt = path.mnt; - if (mark_type == FAN_MARK_MOUNT) - obj = mnt; - else - obj = mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + obj = path.mnt; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + obj = path.mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + obj = mnt_ns_from_dentry(path.dentry); } + ret = -EINVAL; + if (!obj) + goto path_put_and_out; + /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive @@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { - ret = mnt ? -EINVAL : -EISDIR; + ret = !inode ? 
-EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode))) + (!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out; ret = 0; @@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ - if (mnt || !S_ISDIR(inode->i_mode)) { + if (!inode || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* @@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, @@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void) fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index e933f9c65d90..1161eabf11ee 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", sb->s_dev, mflags, mark->mask, mark->ignore_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector); + + seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n", + mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index fae1b6d397ea..e2b4f17a48bb 100644 --- a/fs/notify/fsnotify.c +++ 
b/fs/notify/fsnotify.c @@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + fsnotify_clear_marks_by_mntns(mntns); +} + /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. @@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, file_name, cookie, iter_info); } -static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) +static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; @@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); - struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + struct fsnotify_sb_info *sbinfo = sb ? 
fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; - __u32 test_mask, marks_mask; + __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); @@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!inode2 || !inode2->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks) && + (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; - marks_mask = READ_ONCE(sb->s_fsnotify_mask); + if (sb) + marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); - + if (mnt_data) + marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. @@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } + if (mnt_data) { + iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = + fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); + } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark @@ -708,11 +721,31 @@ void file_set_fsnotify_mode_from_watchers(struct file *file) } #endif +void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{ + struct fsnotify_mnt data = { + .ns = ns, + .mnt_id = real_mount(mnt)->mnt_id_unique, + }; + + if (WARN_ON_ONCE(!ns)) + return; + + /* + * This is an optimization as well as making sure fsnotify_init() has + * been called. 
+ */ + if (!ns->n_fsnotify_marks) + return; + + fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); +} + static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 663759ed6fbc..5950c7a67f41 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb( return conn->obj; } +static inline struct mnt_namespace *fsnotify_conn_mntns( + struct fsnotify_mark_connector *conn) +{ + return conn->obj; +} + static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { @@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } +static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) +{ + fsnotify_destroy_marks(&mntns->n_fsnotify_marks); +} + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4981439e6209..798340db69d7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); + case FSNOTIFY_OBJ_TYPE_MNTNS: + return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } @@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) + return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } @@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ - if (!fsnotify_sb_info(sb)) { + if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; @@ -770,7 +777,8 @@ restart: /* mark should be the last entry. 
last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone diff --git a/fs/nsfs.c b/fs/nsfs.c index f7fddf8ecf73..59aa801347a7 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -151,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, return 0; } +static bool nsfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case NS_GET_USERNS: + case NS_GET_PARENT: + case NS_GET_NSTYPE: + case NS_GET_OWNER_UID: + case NS_GET_MNTNS_ID: + case NS_GET_PID_FROM_PIDNS: + case NS_GET_TGID_FROM_PIDNS: + case NS_GET_PID_IN_PIDNS: + case NS_GET_TGID_IN_PIDNS: + return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + } + + /* Extensible ioctls require some extra handling. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_INFO): + case _IOC_NR(NS_MNT_GET_NEXT): + case _IOC_NR(NS_MNT_GET_PREV): + return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + } + + return false; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct user_namespace *user_ns; struct pid_namespace *pid_ns; struct task_struct *tsk; - struct ns_common *ns = get_proc_ns(file_inode(filp)); + struct ns_common *ns; struct mnt_namespace *mnt_ns; bool previous = false; uid_t __user *argp; uid_t uid; int ret; + if (!nsfs_ioctl_valid(ioctl)) + return -ENOIOCTLCMD; + + ns = get_proc_ns(file_inode(filp)); switch (ioctl) { case NS_GET_USERNS: return open_related_ns(ns, ns_get_owner); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index abf7e81584a9..652735a0b0c4 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -201,11 +201,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct 
dentry *dentry, umode_t mode) +static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, - NULL, 0, NULL); + return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, + NULL, 0, NULL)); } /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 2a7f36643895..5130ec44e5e1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ -static int dlmfs_mkdir(struct mnt_idmap * idmap, - struct inode * dir, - struct dentry * dentry, - umode_t mode) +static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, + struct inode * dir, + struct dentry * dentry, + umode_t mode) { int status; struct inode *inode = NULL; @@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap, bail: if (status < 0) iput(inode); - return status; + return ERR_PTR(status); } static int dlmfs_create(struct mnt_idmap *idmap, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0ec63a1a94b8..99278c8f0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -644,10 +644,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct mnt_idmap *idmap, - struct inode *dir, - struct dentry *dentry, - umode_t mode) +static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { int ret; @@ -657,7 +657,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap, if (ret) mlog_errno(ret); - return ret; + return ERR_PTR(ret); } static int ocfs2_create(struct mnt_idmap *idmap, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 6bda275826d6..2ed541fccf33 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,10 +279,10 @@ out_free_inode: return err; } -static int 
omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return omfs_add_node(dir, dentry, mode | S_IFDIR); + return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR)); } static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/open.c b/fs/open.c index 1be20de9f283..a9063cca9911 100644 --- a/fs/open.c +++ b/fs/open.c @@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -123,7 +123,7 @@ mnt_drop_write_and_out: } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_ftruncate(struct file *file, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small) return error; } -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; @@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; loff_t sum; if (offset < 0 || len <= 0) @@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void) return 
override_creds(override_cred); } -static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; @@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root, } EXPORT_SYMBOL(file_open_root); -static long do_sys_openat2(int dfd, const char __user *filename, - struct open_how *how) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(how, &op); struct filename *tmp; + int err, fd; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); - if (fd >= 0) { + if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); @@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename, return fd; } -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); @@ -1551,7 +1552,7 @@ int filp_close(struct file *filp, fl_owner_t id) int retval; retval = filp_flush(filp, id); - fput(filp); + fput_close(filp); return retval; } @@ -1577,13 +1578,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd) * We're returning to user space. Don't bother * with any delayed fput() cases. 
*/ - __fput_sync(file); + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index d68372241b30..90c49c0de243 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, int buffer_index; ssize_t ret; size_t copy_amount; - int open_for_read; - int open_for_write; + bool open_for_read; + bool open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aae6d2b8767d..5ac743c6bc2e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -16,22 +16,22 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int orangefs_writepage_locked(struct page *page, - struct writeback_control *wbc) +static int orangefs_writepage_locked(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; - set_page_writeback(page); + folio_start_writeback(folio); len = i_size_read(inode); - if (PagePrivate(page)) { - wr = (struct orangefs_write_range *)page_private(page); + if (folio->private) { + wr = folio->private; WARN_ON(wr->pos >= len); off = wr->pos; if (off + wr->len > len) @@ -40,36 +40,27 @@ static int orangefs_writepage_locked(struct page *page, wlen = wr->len; } else { WARN_ON(1); - off = page_offset(page); - if (off + 
PAGE_SIZE > len) + off = folio_pos(folio); + wlen = folio_size(folio); + + if (wlen > len - off) wlen = len - off; - else - wlen = PAGE_SIZE; } /* Should've been handled in orangefs_invalidate_folio. */ WARN_ON(off == len || off + wlen > len); WARN_ON(wlen == 0); - bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); + bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off)); iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); if (ret < 0) { - mapping_set_error(page->mapping, ret); + mapping_set_error(folio->mapping, ret); } else { ret = 0; } - kfree(detach_page_private(page)); - return ret; -} - -static int orangefs_writepage(struct page *page, struct writeback_control *wbc) -{ - int ret; - ret = orangefs_writepage_locked(page, wbc); - unlock_page(page); - end_page_writeback(page); + kfree(folio_detach_private(folio)); return ret; } @@ -79,33 +70,33 @@ struct orangefs_writepages { kuid_t uid; kgid_t gid; int maxpages; - int npages; - struct page **pages; + int nfolios; + struct address_space *mapping; + struct folio **folios; struct bio_vec *bv; }; static int orangefs_writepages_work(struct orangefs_writepages *ow, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = ow->pages[0]->mapping->host; + struct inode *inode = ow->mapping->host; struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + size_t start; + loff_t len, off; int i; len = i_size_read(inode); - for (i = 0; i < ow->npages; i++) { - set_page_writeback(ow->pages[i]); - bvec_set_page(&ow->bv[i], ow->pages[i], - min(page_offset(ow->pages[i]) + PAGE_SIZE, - ow->off + ow->len) - - max(ow->off, page_offset(ow->pages[i])), - i == 0 ? 
ow->off - page_offset(ow->pages[i]) : 0); + start = offset_in_folio(ow->folios[0], ow->off); + for (i = 0; i < ow->nfolios; i++) { + folio_start_writeback(ow->folios[i]); + bvec_set_folio(&ow->bv[i], ow->folios[i], + folio_size(ow->folios[i]) - start, start); + start = 0; } - iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -116,40 +107,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, 0, &wr, NULL, NULL); - if (ret < 0) { - for (i = 0; i < ow->npages; i++) { - mapping_set_error(ow->pages[i]->mapping, ret); - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } - } else { + if (ret < 0) + mapping_set_error(ow->mapping, ret); + else ret = 0; - for (i = 0; i < ow->npages; i++) { - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } + + for (i = 0; i < ow->nfolios; i++) { + wrp = folio_detach_private(ow->folios[i]); + kfree(wrp); + folio_end_writeback(ow->folios[i]); + folio_unlock(ow->folios[i]); } + return ret; } static int orangefs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, struct orangefs_writepages *ow) { - struct orangefs_writepages *ow = data; struct orangefs_write_range *wr = folio->private; int ret; @@ -162,41 +137,41 @@ static int orangefs_writepages_callback(struct folio *folio, } ret = -1; - if (ow->npages == 0) { + if (ow->nfolios == 0) { 
ow->off = wr->pos; ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; ret = -1; goto done; } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } done: if (ret == -1) { - if (ow->npages) { + if (ow->nfolios) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } - ret = orangefs_writepage_locked(&folio->page, wbc); + ret = orangefs_writepage_locked(folio, wbc); mapping_set_error(folio->mapping, ret); folio_unlock(folio); folio_end_writeback(folio); } else { - if (ow->npages == ow->maxpages) { + if (ow->nfolios == ow->maxpages) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } } return ret; @@ -207,31 +182,35 @@ static int orangefs_writepages(struct address_space *mapping, { struct orangefs_writepages *ow; struct blk_plug plug; - int ret; + int error; + struct folio *folio = NULL; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); if (!ow) return -ENOMEM; ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; - ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); - if (!ow->pages) { + ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL); + if (!ow->folios) { kfree(ow); return -ENOMEM; } ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); if (!ow->bv) { - kfree(ow->pages); + kfree(ow->folios); kfree(ow); return -ENOMEM; } + ow->mapping = mapping; blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); - if (ow->npages) - ret = orangefs_writepages_work(ow, wbc); + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = orangefs_writepages_callback(folio, wbc, ow); 
+ if (ow->nfolios) + error = orangefs_writepages_work(ow, wbc); blk_finish_plug(&plug); - kfree(ow->pages); + kfree(ow->folios); kfree(ow->bv); kfree(ow); - return ret; + return error; } static int orangefs_launder_folio(struct folio *); @@ -484,7 +463,7 @@ static int orangefs_launder_folio(struct folio *folio) }; folio_wait_writeback(folio); if (folio_clear_dirty_for_io(folio)) { - r = orangefs_writepage_locked(&folio->page, &wbc); + r = orangefs_writepage_locked(folio, &wbc); folio_end_writeback(folio); } return r; @@ -606,7 +585,6 @@ out: /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { - .writepage = orangefs_writepage, .readahead = orangefs_readahead, .read_folio = orangefs_read_folio, .writepages = orangefs_writepages, @@ -616,6 +594,7 @@ static const struct address_space_operations orangefs_address_operations = { .invalidate_folio = orangefs_invalidate_folio, .release_folio = orangefs_release_folio, .free_folio = orangefs_free_folio, + .migrate_folio = filemap_migrate_folio, .launder_folio = orangefs_launder_folio, .direct_IO = orangefs_direct_IO, }; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 200558ec72f0..82395fe2b956 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -300,8 +300,8 @@ out: return ret; } -static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; @@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); if (!new_op) - return -ENOMEM; + return ERR_PTR(-ENOMEM); new_op->upcall.req.mkdir.parent_refn = parent->refn; @@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode 
*dir, __orangefs_setattr(dir, &iattr); out: op_release(new_op); - return ret; + return ERR_PTR(ret); } static int orangefs_rename(struct mnt_idmap *idmap, diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index edcca4beb765..b562d3dbc76b 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -197,18 +197,6 @@ int orangefs_bufmap_size_query(void) return size; } -int orangefs_bufmap_shift_query(void) -{ - struct orangefs_bufmap *bufmap; - int shift = 0; - spin_lock(&orangefs_bufmap_lock); - bufmap = __orangefs_bufmap; - if (bufmap) - shift = bufmap->desc_shift; - spin_unlock(&orangefs_bufmap_lock); - return shift; -} - static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); @@ -532,16 +520,3 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, } return 0; } - -void orangefs_bufmap_page_fill(void *page_to, - int buffer_index, - int slot_index) -{ - struct orangefs_bufmap_desc *from; - void *page_from; - - from = &__orangefs_bufmap->desc_array[buffer_index]; - page_from = kmap_atomic(from->page_array[slot_index]); - memcpy(page_to, page_from, PAGE_SIZE); - kunmap_atomic(page_from); -} diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h index 75b2d2833af1..4231175ccdb2 100644 --- a/fs/orangefs/orangefs-bufmap.h +++ b/fs/orangefs/orangefs-bufmap.h @@ -10,8 +10,6 @@ int orangefs_bufmap_size_query(void); -int orangefs_bufmap_shift_query(void); - int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc); void orangefs_bufmap_finalize(void); @@ -34,6 +32,5 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, int buffer_index, size_t size); -void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index); #endif /* __ORANGEFS_BUFMAP_H */ diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index 6e079d4230d0..d4463534cec6 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ 
-43,47 +43,4 @@ #define GOSSIP_MAX_NR 16 #define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1) -/* a private internal type */ -struct __keyword_mask_s { - const char *keyword; - __u64 mask_val; -}; - -/* - * Map all kmod keywords to kmod debug masks here. Keep this - * structure "packed": - * - * "all" is always last... - * - * keyword mask_val index - * foo 1 0 - * bar 2 1 - * baz 4 2 - * qux 8 3 - * . . . - */ -static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { - {"super", GOSSIP_SUPER_DEBUG}, - {"inode", GOSSIP_INODE_DEBUG}, - {"file", GOSSIP_FILE_DEBUG}, - {"dir", GOSSIP_DIR_DEBUG}, - {"utils", GOSSIP_UTILS_DEBUG}, - {"wait", GOSSIP_WAIT_DEBUG}, - {"acl", GOSSIP_ACL_DEBUG}, - {"dcache", GOSSIP_DCACHE_DEBUG}, - {"dev", GOSSIP_DEV_DEBUG}, - {"name", GOSSIP_NAME_DEBUG}, - {"bufmap", GOSSIP_BUFMAP_DEBUG}, - {"cache", GOSSIP_CACHE_DEBUG}, - {"debugfs", GOSSIP_DEBUGFS_DEBUG}, - {"xattr", GOSSIP_XATTR_DEBUG}, - {"init", GOSSIP_INIT_DEBUG}, - {"sysfs", GOSSIP_SYSFS_DEBUG}, - {"none", GOSSIP_NO_DEBUG}, - {"all", GOSSIP_MAX_DEBUG} -}; - -static const int num_kmod_keyword_mask_map = (int) - (ARRAY_SIZE(s_kmod_keyword_mask_map)); - #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f52073022fae..f7095c91660c 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -44,6 +44,49 @@ #include "protocol.h" #include "orangefs-kernel.h" +/* a private internal type */ +struct __keyword_mask_s { + const char *keyword; + __u64 mask_val; +}; + +/* + * Map all kmod keywords to kmod debug masks here. Keep this + * structure "packed": + * + * "all" is always last... + * + * keyword mask_val index + * foo 1 0 + * bar 2 1 + * baz 4 2 + * qux 8 3 + * . . . 
+ */ +static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { + {"super", GOSSIP_SUPER_DEBUG}, + {"inode", GOSSIP_INODE_DEBUG}, + {"file", GOSSIP_FILE_DEBUG}, + {"dir", GOSSIP_DIR_DEBUG}, + {"utils", GOSSIP_UTILS_DEBUG}, + {"wait", GOSSIP_WAIT_DEBUG}, + {"acl", GOSSIP_ACL_DEBUG}, + {"dcache", GOSSIP_DCACHE_DEBUG}, + {"dev", GOSSIP_DEV_DEBUG}, + {"name", GOSSIP_NAME_DEBUG}, + {"bufmap", GOSSIP_BUFMAP_DEBUG}, + {"cache", GOSSIP_CACHE_DEBUG}, + {"debugfs", GOSSIP_DEBUGFS_DEBUG}, + {"xattr", GOSSIP_XATTR_DEBUG}, + {"init", GOSSIP_INIT_DEBUG}, + {"sysfs", GOSSIP_SYSFS_DEBUG}, + {"none", GOSSIP_NO_DEBUG}, + {"all", GOSSIP_MAX_DEBUG} +}; + +static const int num_kmod_keyword_mask_map = (int) + (ARRAY_SIZE(s_kmod_keyword_mask_map)); + #define DEBUG_HELP_STRING_SIZE 4096 #define HELP_STRING_UNINITIALIZED \ "Client Debug Keywords are unknown until the first time\n" \ diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index c9993ff66fc2..fe493f3ed6b6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,37 +138,6 @@ kill_whiteout: goto out; } -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode) -{ - int err; - struct dentry *d, *dentry = *newdentry; - - err = ovl_do_mkdir(ofs, dir, dentry, mode); - if (err) - return err; - - if (likely(!d_unhashed(dentry))) - return 0; - - /* - * vfs_mkdir() may succeed and leave the dentry passed - * to it unhashed and negative. If that happens, try to - * lookup a new hashed and positive dentry. 
- */ - d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", - dentry, err); - return PTR_ERR(d); - } - dput(dentry); - *newdentry = d; - - return 0; -} - struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr) { @@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, case S_IFDIR: /* mkdir is special... */ - err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); + newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); + err = PTR_ERR_OR_ZERO(newdentry); break; case S_IFCHR: @@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, } out: if (err) { - dput(newdentry); + if (!IS_ERR(newdentry)) + dput(newdentry); return ERR_PTR(err); } return newdentry; @@ -282,7 +253,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, * XXX: if we ever use ovl_obtain_alias() to decode directory * file handles, need to use ovl_get_inode_locked() and * d_instantiate_new() here to prevent from creating two - * hashed directory inode aliases. + * hashed directory inode aliases. We then need to return + * the obtained alias to ovl_mkdir(). 
*/ inode = ovl_get_inode(dentry->d_sb, &oip); if (IS_ERR(inode)) @@ -687,10 +659,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); + return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL)); } static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0021e2025020..6f2f8f4cfbbc 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs, return err; } -static inline int ovl_do_mkdir(struct ovl_fs *ofs, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); - return err; + dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); + return dentry; } static inline int ovl_do_mknod(struct ovl_fs *ofs, @@ -838,8 +839,6 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode); struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 1115c22deca0..6759f7d040c8 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -59,6 
+59,7 @@ enum ovl_opt { Opt_metacopy, Opt_verity, Opt_volatile, + Opt_override_creds, }; static const struct constant_table ovl_parameter_bool[] = { @@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), + fsparam_flag_no("override_creds", Opt_override_creds), {} }; @@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_userxattr: config->userxattr = true; break; + case Opt_override_creds: { + const struct cred *cred = NULL; + + if (result.negated) { + swap(cred, ofs->creator_cred); + put_cred(cred); + break; + } + + if (!current_in_userns(fc->user_ns)) { + err = -EINVAL; + break; + } + + cred = prepare_creds(); + if (cred) + swap(cred, ofs->creator_cred); + else + err = -ENOMEM; + + put_cred(cred); + break; + } default: pr_err("unrecognized mount option \"%s\" or missing value\n", param->key); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 86ae6f6da36b..b63474d1b064 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -327,9 +327,10 @@ retry: goto retry; } - err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); - if (err) - goto out_dput; + work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); + err = PTR_ERR(work); + if (IS_ERR(work)) + goto out_err; /* Weird filesystem returning with hashed negative (kernfs)? 
*/ err = -EINVAL; @@ -1305,6 +1306,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; struct ovl_fs_context *ctx = fc->fs_private; + const struct cred *old_cred = NULL; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_layer *layers; @@ -1318,10 +1320,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - ofs->creator_cred = cred = prepare_creds(); + if (!ofs->creator_cred) + ofs->creator_cred = cred = prepare_creds(); + else + cred = (struct cred *)ofs->creator_cred; if (!cred) goto out_err; + old_cred = ovl_override_creds(sb); + err = ovl_fs_params_verify(ctx, &ofs->config); if (err) goto out_err; @@ -1481,11 +1488,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = root_dentry; + ovl_revert_creds(old_cred); return 0; out_free_oe: ovl_free_entry(oe); out_err: + /* + * Revert creds before calling ovl_free_fs() which will call + * put_cred() and put_cred() requires that the cred's that are + * put are not the caller's creds, i.e., current->cred. + */ + if (old_cred) + ovl_revert_creds(old_cred); ovl_free_fs(ofs); sb->s_fs_info = NULL; return err; diff --git a/fs/pidfs.c b/fs/pidfs.c index c0478b3c55d9..d64a4cbeb0da 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -24,6 +24,28 @@ #include "internal.h" #include "mount.h" +static struct kmem_cache *pidfs_cachep __ro_after_init; + +/* + * Stashes information that userspace needs to access even after the + * process has been reaped. 
+ */ +struct pidfs_exit_info { + __u64 cgroupid; + __s32 exit_code; +}; + +struct pidfs_inode { + struct pidfs_exit_info __pei; + struct pidfs_exit_info *exit_info; + struct inode vfs_inode; +}; + +static inline struct pidfs_inode *pidfs_i(struct inode *inode) +{ + return container_of(inode, struct pidfs_inode, vfs_inode); +} + static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -188,36 +210,48 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); - bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* - * Depending on PIDFD_THREAD, inform pollers when the thread - * or the whole thread-group exits. + * Don't wake waiters if the thread-group leader exited + * prematurely. They either get notified when the last subthread + * exits or not at all if one of the remaining subthreads execs + * and assumes the struct pid of the old thread-group leader. 
*/ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; - else if (task->exit_state && (thread || thread_group_empty(task))) + else if (task->exit_state && !delay_group_leader(task)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +static inline bool pid_in_current_pidns(const struct pid *pid) +{ + const struct pid_namespace *ns = task_active_pid_ns(current); + + if (ns->level <= pid->level) + return pid->numbers[ns->level].ns == ns; + + return false; +} + +static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + struct inode *inode = file_inode(file); + struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; + struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; + struct task_struct *task; const struct cred *c; __u64 mask; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; -#endif if (!uinfo) return -EINVAL; @@ -227,6 +261,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; + /* + * Restrict information retrieval to tasks within the caller's pid + * namespace hierarchy. 
+ */ + if (!pid_in_current_pidns(pid)) + return -ESRCH; + + if (mask & PIDFD_INFO_EXIT) { + exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + if (exit_info) { + kinfo.mask |= PIDFD_INFO_EXIT; +#ifdef CONFIG_CGROUPS + kinfo.cgroupid = exit_info->cgroupid; + kinfo.mask |= PIDFD_INFO_CGROUPID; +#endif + kinfo.exit_code = exit_info->exit_code; + } + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + /* + * If the task has already been reaped, only exit + * information is available + */ + if (!(mask & PIDFD_INFO_EXIT)) + return -ESRCH; + + goto copy_out; + } + c = get_task_cred(task); if (!c) return -ESRCH; @@ -246,11 +311,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long put_cred(c); #ifdef CONFIG_CGROUPS - rcu_read_lock(); - cgrp = task_dfl_cgroup(task); - kinfo.cgroupid = cgroup_id(cgrp); - kinfo.mask |= PIDFD_INFO_CGROUPID; - rcu_read_unlock(); + if (!kinfo.cgroupid) { + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); + } #endif /* @@ -270,16 +339,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; +copy_out: /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. 
*/ - if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) - return -EFAULT; - - return 0; + return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); } static bool pidfs_ioctl_valid(unsigned int cmd) @@ -317,7 +384,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; - struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; @@ -332,13 +398,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return put_user(file_inode(file)->i_generation, argp); } - task = get_pid_task(pid, PIDTYPE_PID); - if (!task) - return -ESRCH; - /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) - return pidfd_info(task, cmd, arg); + return pidfd_info(file, cmd, arg); + + task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); + if (!task) + return -ESRCH; if (arg) return -EINVAL; @@ -450,6 +516,49 @@ struct pid *pidfd_pid(const struct file *file) return file_inode(file)->i_private; } +/* + * We're called from release_task(). We know there's at least one + * reference to struct pid being held that won't be released until the + * task has been reaped which cannot happen until we're out of + * release_task(). + * + * If this struct pid is referred to by a pidfd then + * stashed_dentry_get() will return the dentry and inode for that struct + * pid. Since we've taken a reference on it there's now an additional + * reference from the exit path on it. Which is fine. We're going to put + * it again in a second and we know that the pid is kept alive anyway. + * + * Worst case is that we've filled in the info and immediately free the + * dentry and inode afterwards since the pidfd has been closed. 
Since + * pidfs_exit() currently is placed after exit_task_work() we know that + * it cannot be us aka the exiting task holding a pidfd to ourselves. + */ +void pidfs_exit(struct task_struct *tsk) +{ + struct dentry *dentry; + + might_sleep(); + + dentry = stashed_dentry_get(&task_pid(tsk)->stashed); + if (dentry) { + struct inode *inode = d_inode(dentry); + struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); + dput(dentry); + } +} + static struct vfsmount *pidfs_mnt __ro_after_init; /* @@ -505,9 +614,30 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } +static struct inode *pidfs_alloc_inode(struct super_block *sb) +{ + struct pidfs_inode *pi; + + pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); + if (!pi) + return NULL; + + memset(&pi->__pei, 0, sizeof(pi->__pei)); + pi->exit_info = NULL; + + return &pi->vfs_inode; +} + +static void pidfs_free_inode(struct inode *inode) +{ + kmem_cache_free(pidfs_cachep, pidfs_i(inode)); +} + static const struct super_operations pidfs_sops = { + .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, + .free_inode = pidfs_free_inode, .statfs = simple_statfs, }; @@ -633,8 +763,49 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } +static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, + unsigned int flags) +{ + enum pid_type type; + + if (flags & PIDFD_CLONE) + return true; + + /* + * Make sure that if a pidfd is created PIDFD_INFO_EXIT + * information will be available. 
So after an inode for the + * pidfd has been allocated perform another check that the pid + * is still alive. If it is exit information is available even + * if the task gets reaped before the pidfd is returned to + * userspace. The only exception is PIDFD_CLONE where no task + * linkage has been established for @pid yet and the kernel is + * in the middle of process creation so there's nothing for + * pidfs to miss. + */ + if (flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + + /* + * Since pidfs_exit() is called before struct pid's task linkage + * is removed the case where the task got reaped but a dentry + * was already attached to struct pid and exit information was + * recorded and published can be handled correctly. + */ + if (unlikely(!pid_has_task(pid, type))) { + struct inode *inode = d_inode(path->dentry); + return !!READ_ONCE(pidfs_i(inode)->exit_info); + } + + return true; +} + static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { + if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) + return ERR_PTR(-ESRCH); + /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -698,22 +869,46 @@ static struct file_system_type pidfs_type = { struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { - struct file *pidfd_file; - struct path path; + struct path path __free(path_put) = {}; int ret; + /* + * Ensure that PIDFD_CLONE can be passed as a flag without + * overloading other uapi pidfd flags. + */ + BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD); + BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK); + ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); + if (!pidfs_pid_valid(pid, &path, flags)) + return ERR_PTR(-ESRCH); + + flags &= ~PIDFD_CLONE; pidfd_file = dentry_open(&path, flags, current_cred()); - path_put(&path); + /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. 
*/ + if (!IS_ERR(pidfd_file)) + pidfd_file->f_flags |= (flags & PIDFD_THREAD); + return pidfd_file; } +static void pidfs_inode_init_once(void *data) +{ + struct pidfs_inode *pi = data; + + inode_init_once(&pi->vfs_inode); +} + void __init pidfs_init(void) { + pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), + pidfs_inode_init_once); pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); diff --git a/fs/pipe.c b/fs/pipe.c index 4d0799e4e719..da45edd68c41 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock(pipe2); } +static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe) +{ + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) { + struct page *page = pipe->tmp_page[i]; + pipe->tmp_page[i] = NULL; + return page; + } + } + + return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); +} + +static void anon_pipe_put_page(struct pipe_inode_info *pipe, + struct page *page) +{ + if (page_count(page) == 1) { + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (!pipe->tmp_page[i]) { + pipe->tmp_page[i] = page; + return; + } + } + } + + put_page(page); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - /* - * If nobody else uses this page, and we don't already have a - * temporary page, let's keep track of it as a one-deep - * allocation cache. 
(Otherwise just release our reference to it) - */ - if (page_count(page) == 1 && !pipe->tmp_page) - pipe->tmp_page = page; - else - put_page(page); + anon_pipe_put_page(pipe, page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, @@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { @@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) #endif if (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t chars = buf->len; size_t written; int error; @@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. 
- */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } @@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && @@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { - unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; - struct page *page 
= pipe->tmp_page; + struct page *page; int copied; - if (!page) { - page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); - if (unlikely(!page)) { - ret = ret ? : -ENOMEM; - break; - } - pipe->tmp_page = page; + page = anon_pipe_get_page(pipe); + if (unlikely(!page)) { + if (!ret) + ret = -ENOMEM; + break; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + anon_pipe_put_page(pipe, page); + if (!ret) + ret = -EFAULT; + break; + } + pipe->head = head + 1; /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; - } - if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + } /* Wait for buffer space to become available. 
*/ if ((filp->f_flags & O_NONBLOCK) || @@ -602,11 +596,21 @@ out: kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif - if (pipe->tmp_page) - __free_page(pipe->tmp_page); + for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) + __free_page(pipe->tmp_page[i]); + } kfree(pipe->bufs); kfree(pipe); } @@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY 
| (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_pipe = 0; @@ -1234,8 +1242,19 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) @@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if (file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; diff --git a/fs/pnode.c b/fs/pnode.c index ef048f008bdd..7a062a5de10e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this 
mount? */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { @@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m, * Advance m such that propagation_next will not return * the slaves of m. */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; @@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) while (1) { while (1) { struct mount *next; - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { @@ -226,7 +226,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) struct mount *child; int type; /* skip ones added by this propagate_mnt() */ - if (IS_MNT_NEW(m)) + if (IS_MNT_PROPAGATED(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) @@ -380,7 +380,7 @@ bool propagation_would_overmount(const struct mount *from, if (!IS_MNT_SHARED(from)) return false; - if (IS_MNT_NEW(to)) + if (IS_MNT_PROPAGATED(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) @@ -549,8 +549,10 @@ static void restore_mounts(struct list_head *to_restore) mp = parent->mnt_mp; parent = parent->mnt_parent; } - if (parent != mnt->mnt_parent) + if (parent != mnt->mnt_parent) { mnt_change_mountpoint(parent, mp, mnt); + mnt_notify_add(mnt); + } } } diff --git a/fs/pnode.h b/fs/pnode.h index 0b02a6393891..ddafe0d087ca 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@ #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns)) +#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || 
((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING)) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) diff --git a/fs/proc/base.c b/fs/proc/base.c index cd89e956c322..5538c4aee8fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = { #endif -#ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ @@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = { .release = single_release, }; -#endif - #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid *pid; + struct task_struct *task; + struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) @@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_hlist_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - 
tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%px\n", - timer->sigq.info.si_signo, + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif @@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1cb33771bf9f..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -34,8 +34,6 @@ #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -122,7 +120,9 @@ static void update_kcore_size(void) kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); kcore_notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + @@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, 
sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce79..bb3b769edc71 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -14,10 +14,10 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/string.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/ramfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/magic.h> #include <linux/pstore.h> @@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb) } enum { - Opt_kmsg_bytes, Opt_err + Opt_kmsg_bytes }; -static const match_table_t tokens = { - {Opt_kmsg_bytes, "kmsg_bytes=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec pstore_param_spec[] = { + fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes), + {} }; -static void parse_options(char *options) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return; +struct pstore_context { + unsigned int kmsg_bytes; +}; - while ((p = strsep(&options, ",")) != NULL) { - int token; +static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct pstore_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - if (!*p) - continue; + opt = fs_parse(fc, pstore_param_spec, param, &result); + /* pstore has historically ignored invalid kmsg_bytes param */ + if (opt < 0) + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_kmsg_bytes: - if (!match_int(&args[0], &option)) - pstore_set_kmsg_bytes(option); - break; - } + switch (opt) { + case Opt_kmsg_bytes: + ctx->kmsg_bytes = result.uint_32; + break; + default: + return -EINVAL; } + + return 0; } /* @@ -265,14 +266,16 @@ static void 
parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } -static int pstore_remount(struct super_block *sb, int *flags, char *data) +static int pstore_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - parse_options(data); + struct pstore_context *ctx = fc->fs_private; + + sync_filesystem(fc->root->d_sb); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); return 0; } @@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = pstore_evict_inode, - .remount_fs = pstore_remount, .show_options = pstore_show_options, }; @@ -406,8 +408,9 @@ void pstore_get_records(int quiet) inode_unlock(d_inode(root)); } -static int pstore_fill_super(struct super_block *sb, void *data, int silent) +static int pstore_fill_super(struct super_block *sb, struct fs_context *fc) { + struct pstore_context *ctx = fc->fs_private; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &pstore_ops; sb->s_time_gran = 1; - parse_options(data); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); inode = pstore_get_inode(sb); if (inode) { @@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *pstore_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int pstore_get_tree(struct fs_context *fc) +{ + if (fc->root) + return pstore_reconfigure(fc); + + return get_tree_single(fc, pstore_fill_super); +} + +static void pstore_free_fc(struct fs_context *fc) { - return mount_single(fs_type, flags, data, pstore_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations 
pstore_context_ops = { + .parse_param = pstore_parse_param, + .get_tree = pstore_get_tree, + .reconfigure = pstore_reconfigure, + .free = pstore_free_fc, +}; + static void pstore_kill_sb(struct super_block *sb) { guard(mutex)(&pstore_sb_lock); @@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb) INIT_LIST_HEAD(&records_list); } +static int pstore_init_fs_context(struct fs_context *fc) +{ + struct pstore_context *ctx; + + ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* + * Global kmsg_bytes is initialized to default, and updated + * every time we (re)mount the single-sb filesystem with the + * option specified. + */ + ctx->kmsg_bytes = kmsg_bytes; + + fc->fs_private = ctx; + fc->ops = &pstore_context_ops; + + return 0; +} + static struct file_system_type pstore_fs_type = { .owner = THIS_MODULE, .name = "pstore", - .mount = pstore_mount, .kill_sb = pstore_kill_sb, + .init_fs_context = pstore_init_fs_context, + .parameters = pstore_param_spec, }; int __init pstore_init_fs(void) diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80c..557cf9d40177 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to 
use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8006faaaf0ec..775fa905fda0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -119,13 +119,13 @@ out: return error; } -static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/read_write.c b/fs/read_write.c index a6133241dfb8..bb0ed26a0b3a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, if 
(whence == SEEK_CUR) { /* - * f_lock protects against read/modify/write race with - * other SEEK_CURs. Note that parallel writes and reads - * behave like SEEK_SET. + * If the file requires locking via f_pos_lock we know + * that mutual exclusion for SEEK_CUR on the same file + * is guaranteed. If the file isn't locked, we take + * f_lock to protect against f_pos races with other + * SEEK_CURs. */ - guard(spinlock)(&file->f_lock); + if (file_seek_cur_needs_f_lock(file)) { + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); + } return vfs_setpos(file, file->f_pos + offset, maxsize); } diff --git a/fs/signalfd.c b/fs/signalfd.c index d1a5f43ce466..d469782f97f4 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) return ufd; } - file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx, - O_RDWR | (flags & O_NONBLOCK)); + file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, + ctx, O_RDWR | (flags & O_NONBLOCK), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; - fd_install(ufd, file); } else { CLASS(fd, f)(ufd); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 831fee962c4d..8dea0cf3a8de 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, diff 
--git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8582cf61242c..9e4f7378f30f 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) spin_unlock(&tcon->tc_lock); /* - * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * BB Add call to evict_inodes(sb) for all superblocks mounted * to this tcon. */ } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 616149c7f0a5..3bb21aa58474 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2207,8 +2207,8 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, - struct dentry *direntry, umode_t mode) +struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, + struct dentry *direntry, umode_t mode) { int rc = 0; unsigned int xid; @@ -2224,10 +2224,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, cifs_sb = CIFS_SB(inode->i_sb); if (unlikely(cifs_forced_shutdown(cifs_sb))) - return -EIO; + return ERR_PTR(-EIO); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return PTR_ERR(tlink); + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); @@ -2283,7 +2283,7 @@ mkdir_out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); - return rc; + return ERR_PTR(rc); } int cifs_rmdir(struct inode *inode, struct dentry *direntry) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 6890016e1923..8554aa5a1059 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -113,11 +113,6 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, if (IS_ERR(d)) goto err_out; - if (d_is_negative(d)) { - dput(d); - goto err_out; - } - path->dentry = d; path->mnt = mntget(parent_path->mnt); @@ -211,8 +206,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { struct mnt_idmap *idmap; struct path path; - struct dentry *dentry; 
- int err; + struct dentry *dentry, *d; + int err = 0; dentry = ksmbd_vfs_kern_path_create(work, name, LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, @@ -227,27 +222,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (!err && d_unhashed(dentry)) { - struct dentry *d; - - d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - err = PTR_ERR(d); - goto out_err; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = -ENOENT; - goto out_err; - } - - ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); - dput(d); - } + d = dentry; + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + if (IS_ERR(dentry)) + err = PTR_ERR(dentry); + else if (d_is_negative(dentry)) + err = -ENOENT; + if (!err && dentry != d) + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); -out_err: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -693,6 +676,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; + int target_lookup_flags = LOOKUP_RENAME_TARGET; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -703,6 +687,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } + /* + * explicitly handle file overwrite case, for compatibility with + * filesystems that may not support rename flags (e.g: fuse) + */ + if (flags & RENAME_NOREPLACE) + target_lookup_flags |= LOOKUP_EXCL; + flags &= ~(RENAME_NOREPLACE); + retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -743,7 +735,7 @@ retry: } new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | LOOKUP_RENAME_TARGET); + lookup_flags | 
target_lookup_flags); if (IS_ERR(new_dentry)) { err = PTR_ERR(new_dentry); goto out3; @@ -754,16 +746,6 @@ retry: goto out4; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) { - err = -EEXIST; - goto out4; - } - flags &= ~(RENAME_NOREPLACE); - if (old_child == trap) { err = -EINVAL; goto out4; diff --git a/fs/splice.c b/fs/splice.c index 23fa5561b944..90d464241f15 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; - unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; @@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (!pipe_full(head, tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { @@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - pipe->bufs[head & mask] = *buf; + *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } @@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = 
pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) @@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); @@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; - unsigned int head, tail, mask; + unsigned int head, tail; size_t left; int n; @@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ @@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; @@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, pipe_lock(pipe); while (len > 0) { - unsigned int head, tail, mask, bc = 0; + unsigned int head, tail, bc = 0; size_t remain = len; /* @@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct 
pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { @@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, len -= ret; tail = pipe->tail; while (ret > 0) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; @@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; @@ -1747,9 +1740,7 @@ retry: pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { size_t o_len; @@ -1792,8 +1783,8 @@ retry: goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* @@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; ssize_t ret = 0; /* @@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { @@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, diff --git a/fs/super.c b/fs/super.c index 5a7db4a556e3..97a17f9d9023 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct 
block_device *bdev, bool surprise) if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); - invalidate_inodes(sb); + evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); @@ -1737,61 +1737,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int reconfigure_single(struct super_block *s, - int flags, void *data) -{ - struct fs_context *fc; - int ret; - - /* The caller really need to be passing fc down into mount_single(), - * then a chunk of this can be removed. [Bollocks -- AV] - * Better yet, reconfiguration shouldn't happen, but rather the second - * mount should be rejected if the parameters are not compatible. - */ - fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - ret = parse_monolithic_mount_data(fc, data); - if (ret < 0) - goto out; - - ret = reconfigure_super(fc); -out: - put_fs_context(fc); - return ret; -} - -static int compare_single(struct super_block *s, void *p) -{ - return 1; -} - -struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - - s = sget(fs_type, compare_single, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - if (!s->s_root) { - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (!error) - s->s_flags |= SB_ACTIVE; - } else { - error = reconfigure_single(s, flags, data); - } - if (unlikely(error)) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_single); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. 
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig deleted file mode 100644 index 67b3f90afbfd..000000000000 --- a/fs/sysv/Kconfig +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config SYSV_FS - tristate "System V/Xenix/V7/Coherent file system support" - depends on BLOCK - select BUFFER_HEAD - help - SCO, Xenix and Coherent are commercial Unix systems for Intel - machines, and Version 7 was used on the DEC PDP-11. Saying Y - here would allow you to read from their floppies and hard disk - partitions. - - If you have floppies or hard disk partitions like that, it is likely - that they contain binaries from those other Unix systems; in order - to run these binaries, you will want to install linux-abi which is - a set of kernel modules that lets you run SCO, Xenix, Wyse, - UnixWare, Dell Unix and System V programs under Linux. It is - available via FTP (user: ftp) from - <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>). - NOTE: that will work only for binaries from Intel-based systems; - PDP ones will have to wait until somebody ports Linux to -11 ;-) - - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the System V file system support - (but you need NFS file system support obviously). - - Note that this option is generally not needed for floppies, since a - good portable way to transport files and directories between unixes - (and even other operating systems) is given by the tar program ("man - tar" or preferably "info tar"). Note also that this option has - nothing whatsoever to do with the option "System V IPC". Read about - the System V file system in - <file:Documentation/filesystems/sysv-fs.rst>. - Saying Y here will enlarge your kernel by about 27 KB. - - To compile this as a module, choose M here: the module will be called - sysv. - - If you haven't heard about all of this before, it's safe to say N. 
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile deleted file mode 100644 index 17d12ba04b18..000000000000 --- a/fs/sysv/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for the Linux SystemV/Coherent filesystem routines. -# - -obj-$(CONFIG_SYSV_FS) += sysv.o - -sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ - namei.o super.o diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c deleted file mode 100644 index 0e69dbdf7277..000000000000 --- a/fs/sysv/balloc.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/balloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/balloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing blocks. - */ - -#include <linux/buffer_head.h> -#include <linux/string.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tfree = *sb->sv_free_blocks - but we nevertheless keep it up to date. */ - -static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) -{ - char *bh_data = bh->b_data; - - if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) - return (sysv_zone_t*)(bh_data+4); - else - return (sysv_zone_t*)(bh_data+2); -} - -/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ - -void sysv_free_block(struct super_block * sb, sysv_zone_t nr) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - sysv_zone_t *blocks = sbi->s_bcache; - unsigned count; - unsigned block = fs32_to_cpu(sbi, nr); - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only no one - * should call this for an AFS filesystem anyway... 
- */ - if (sbi->s_type == FSTYPE_AFS) - return; - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_free_block: trying to free block not in datazone\n"); - return; - } - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count > sbi->s_flc_size) { - printk("sysv_free_block: flc_count > flc_size\n"); - mutex_unlock(&sbi->s_lock); - return; - } - /* If the free list head in super-block is full, it is copied - * into this block being freed, ditto if it's completely empty - * (applies only on Coherent). - */ - if (count == sbi->s_flc_size || count == 0) { - block += sbi->s_block_base; - bh = sb_getblk(sb, block); - if (!bh) { - printk("sysv_free_block: getblk() failed\n"); - mutex_unlock(&sbi->s_lock); - return; - } - memset(bh->b_data, 0, sb->s_blocksize); - *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); - memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); - mark_buffer_dirty(bh); - set_buffer_uptodate(bh); - brelse(bh); - count = 0; - } - sbi->s_bcache[count++] = nr; - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - fs32_add(sbi, sbi->s_free_blocks, 1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); -} - -sysv_zone_t sysv_new_block(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int block; - sysv_zone_t nr; - struct buffer_head * bh; - unsigned count; - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count == 0) /* Applies only to Coherent FS */ - goto Enospc; - nr = sbi->s_bcache[--count]; - if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ - goto Enospc; - - block = fs32_to_cpu(sbi, nr); - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_new_block: new block %d is not in data zone\n", - block); - goto Enospc; - } - - if (count == 0) { /* the last block continues the free list */ - unsigned count; - - block += sbi->s_block_base; - 
if (!(bh = sb_bread(sb, block))) { - printk("sysv_new_block: cannot read free-list block\n"); - /* retry this same block next time */ - *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); - goto Enospc; - } - count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - if (count > sbi->s_flc_size) { - printk("sysv_new_block: free-list block with >flc_size entries\n"); - brelse(bh); - goto Enospc; - } - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - memcpy(sbi->s_bcache, get_chunk(sb, bh), - count * sizeof(sysv_zone_t)); - brelse(bh); - } - /* Now the free list head in the superblock is valid again. */ - fs32_add(sbi, sbi->s_free_blocks, -1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); - return nr; - -Enospc: - mutex_unlock(&sbi->s_lock); - return 0; -} - -unsigned long sysv_count_free_blocks(struct super_block * sb) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - int sb_count; - int count; - struct buffer_head * bh = NULL; - sysv_zone_t *blocks; - unsigned block; - int n; - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only we just - * lie and say it has no free block at all. - */ - if (sbi->s_type == FSTYPE_AFS) - return 0; - - mutex_lock(&sbi->s_lock); - sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... 
*/ - count = 0; - n = fs16_to_cpu(sbi, *sbi->s_bcache_count); - blocks = sbi->s_bcache; - while (1) { - sysv_zone_t zone; - if (n > sbi->s_flc_size) - goto E2big; - zone = 0; - while (n && (zone = blocks[--n]) != 0) - count++; - if (zone == 0) - break; - - block = fs32_to_cpu(sbi, zone); - if (bh) - brelse(bh); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) - goto Einval; - block += sbi->s_block_base; - bh = sb_bread(sb, block); - if (!bh) - goto Eio; - n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - blocks = get_chunk(sb, bh); - } - if (bh) - brelse(bh); - if (count != sb_count) - goto Ecount; -done: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_blocks: new block %d is not in data zone\n", - block); - goto trust_sb; -Eio: - printk("sysv_count_free_blocks: cannot read free-list block\n"); - goto trust_sb; -E2big: - printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); - if (bh) - brelse(bh); -trust_sb: - count = sb_count; - goto done; -Ecount: - printk("sysv_count_free_blocks: free block count was %d, " - "correcting to %d\n", sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_free_blocks = cpu_to_fs32(sbi, count); - dirty_sb(sb); - } - goto done; -} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c deleted file mode 100644 index 639307e2ff8c..000000000000 --- a/fs/sysv/dir.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/dir.c - * - * minix/dir.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/dir.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/dir.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent directory handling functions - */ - -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include "sysv.h" - -static int sysv_readdir(struct file *, struct dir_context *); - -const struct file_operations sysv_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - 
.iterate_shared = sysv_readdir, - .fsync = generic_file_fsync, -}; - -static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - struct address_space *mapping = folio->mapping; - struct inode *dir = mapping->host; - - block_write_end(NULL, mapping, pos, len, len, folio, NULL); - if (pos+len > dir->i_size) { - i_size_write(dir, pos+len); - mark_inode_dirty(dir); - } - folio_unlock(folio); -} - -static int sysv_handle_dirsync(struct inode *dir) -{ - int err; - - err = filemap_write_and_wait(dir->i_mapping); - if (!err) - err = sync_inode_metadata(dir, 1); - return err; -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() - * and must be treated accordingly for nesting purposes. - */ -static void *dir_get_folio(struct inode *dir, unsigned long n, - struct folio **foliop) -{ - struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); - - if (IS_ERR(folio)) - return ERR_CAST(folio); - *foliop = folio; - return kmap_local_folio(folio, 0); -} - -static int sysv_readdir(struct file *file, struct dir_context *ctx) -{ - unsigned long pos = ctx->pos; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - unsigned long npages = dir_pages(inode); - unsigned offset; - unsigned long n; - - ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); - if (pos >= inode->i_size) - return 0; - - offset = pos & ~PAGE_MASK; - n = pos >> PAGE_SHIFT; - - for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; - struct sysv_dir_entry *de; - struct folio *folio; - - kaddr = dir_get_folio(inode, n, &folio); - if (IS_ERR(kaddr)) - continue; - de = (struct sysv_dir_entry *)(kaddr+offset); - limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; - for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { - char *name = de->name; - - if (!de->inode) - continue; - - if 
(!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), - fs16_to_cpu(SYSV_SB(sb), de->inode), - DT_UNKNOWN)) { - folio_release_kmap(folio, kaddr); - return 0; - } - } - folio_release_kmap(folio, kaddr); - } - return 0; -} - -/* compare strings: name[0..len-1] (not zero-terminated) and - * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) - */ -static inline int namecompare(int len, int maxlen, - const char * name, const char * buffer) -{ - if (len < maxlen && buffer[len]) - return 0; - return !memcmp(name, buffer, len); -} - -/* - * sysv_find_entry() - * - * finds an entry in the specified directory with the wanted name. - * It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * On Success folio_release_kmap() should be called on *foliop. - * - * sysv_find_entry() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. - */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) -{ - const char * name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct inode * dir = d_inode(dentry->d_parent); - unsigned long start, n; - unsigned long npages = dir_pages(dir); - struct sysv_dir_entry *de; - - start = SYSV_I(dir)->i_dir_start_lookup; - if (start >= npages) - start = 0; - n = start; - - do { - char *kaddr = dir_get_folio(dir, n, foliop); - - if (!IS_ERR(kaddr)) { - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(*foliop) - SYSV_DIRSIZE; - for ( ; (char *) de <= kaddr ; de++) { - if (!de->inode) - continue; - if (namecompare(namelen, SYSV_NAMELEN, - name, de->name)) - goto found; - } - folio_release_kmap(*foliop, kaddr); - } - - if (++n >= npages) - n = 0; - } while (n != start); - - return NULL; - -found: - SYSV_I(dir)->i_dir_start_lookup = n; - return de; -} - -int sysv_add_link(struct dentry *dentry, struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char * name = dentry->d_name.name; - int 
namelen = dentry->d_name.len; - struct folio *folio = NULL; - struct sysv_dir_entry * de; - unsigned long npages = dir_pages(dir); - unsigned long n; - char *kaddr; - loff_t pos; - int err; - - /* We take care of directory expansion in the same loop */ - for (n = 0; n <= npages; n++) { - kaddr = dir_get_folio(dir, n, &folio); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); - de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE - SYSV_DIRSIZE; - while ((char *)de <= kaddr) { - if (!de->inode) - goto got_it; - err = -EEXIST; - if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) - goto out_folio; - de++; - } - folio_release_kmap(folio, kaddr); - } - BUG(); - return -EINVAL; - -got_it: - pos = folio_pos(folio) + offset_in_folio(folio, de); - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) - goto out_unlock; - memcpy (de->name, name, namelen); - memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - err = sysv_handle_dirsync(dir); -out_folio: - folio_release_kmap(folio, kaddr); - return err; -out_unlock: - folio_unlock(folio); - goto out_folio; -} - -int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) -{ - struct inode *inode = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = 0; - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - mark_inode_dirty(inode); - return sysv_handle_dirsync(inode); -} - -int sysv_make_empty(struct inode *inode, struct inode *dir) -{ - struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); - struct sysv_dir_entry * de; - char *kaddr; 
- int err; - - if (IS_ERR(folio)) - return PTR_ERR(folio); - err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - goto fail; - } - kaddr = kmap_local_folio(folio, 0); - memset(kaddr, 0, folio_size(folio)); - - de = (struct sysv_dir_entry *)kaddr; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - strcpy(de->name,"."); - de++; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); - strcpy(de->name,".."); - - kunmap_local(kaddr); - dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); - err = sysv_handle_dirsync(inode); -fail: - folio_put(folio); - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -int sysv_empty_dir(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct folio *folio = NULL; - unsigned long i, npages = dir_pages(inode); - char *kaddr; - - for (i = 0; i < npages; i++) { - struct sysv_dir_entry *de; - - kaddr = dir_get_folio(inode, i, &folio); - if (IS_ERR(kaddr)) - continue; - - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(folio) - SYSV_DIRSIZE; - - for ( ;(char *)de <= kaddr; de++) { - if (!de->inode) - continue; - /* check for . and .. */ - if (de->name[0] != '.') - goto not_empty; - if (!de->name[1]) { - if (de->inode == cpu_to_fs16(SYSV_SB(sb), - inode->i_ino)) - continue; - goto not_empty; - } - if (de->name[1] != '.' 
|| de->name[2]) - goto not_empty; - } - folio_release_kmap(folio, kaddr); - } - return 1; - -not_empty: - folio_release_kmap(folio, kaddr); - return 0; -} - -/* Releases the page */ -int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, - struct inode *inode) -{ - struct inode *dir = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - return sysv_handle_dirsync(inode); -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * sysv_dotdot() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. - */ -struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) -{ - struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); - - if (IS_ERR(de)) - return NULL; - /* ".." 
is the second directory entry */ - return de + 1; -} - -ino_t sysv_inode_by_name(struct dentry *dentry) -{ - struct folio *folio; - struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); - ino_t res = 0; - - if (de) { - res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - folio_release_kmap(folio, de); - } - return res; -} diff --git a/fs/sysv/file.c b/fs/sysv/file.c deleted file mode 100644 index c645f60bdb7f..000000000000 --- a/fs/sysv/file.c +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/file.c - * - * minix/file.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/file.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/file.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent regular file handling primitives - */ - -#include "sysv.h" - -/* - * We have mostly NULLs here: the current defaults are OK for - * the coh filesystem. - */ -const struct file_operations sysv_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .fsync = generic_file_fsync, - .splice_read = filemap_splice_read, -}; - -static int sysv_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int error; - - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (error) - return error; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - truncate_setsize(inode, attr->ia_size); - sysv_truncate(inode); - } - - setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations sysv_file_inode_operations = { - .setattr = sysv_setattr, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c deleted file mode 100644 index 269df6d49815..000000000000 --- 
a/fs/sysv/ialloc.c +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/ialloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/ialloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing inodes. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes - but we nevertheless keep it up to date. */ - -/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */ - -/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ -static inline sysv_ino_t * -sv_sb_fic_inode(struct super_block * sb, unsigned int i) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (sbi->s_bh1 == sbi->s_bh2) - return &sbi->s_sb_fic_inodes[i]; - else { - /* 512 byte Xenix FS */ - unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); - if (offset < 512) - return (sysv_ino_t*)(sbi->s_sbd1 + offset); - else - return (sysv_ino_t*)(sbi->s_sbd2 + offset); - } -} - -struct sysv_inode * -sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct sysv_inode *res; - int block = sbi->s_firstinodezone + sbi->s_block_base; - - block += (ino-1) >> sbi->s_inodes_per_block_bits; - *bh = sb_bread(sb, block); - if (!*bh) - return NULL; - res = (struct sysv_inode *)(*bh)->b_data; - return res + ((ino-1) & sbi->s_inodes_per_block_1); -} - -static int refill_free_cache(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct 
buffer_head * bh; - struct sysv_inode * raw_inode; - int i = 0, ino; - - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { - *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); - if (i == sbi->s_fic_size) - break; - } - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - } else - raw_inode++; - } - brelse(bh); -out: - return i; -} - -void sysv_free_inode(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int ino; - struct buffer_head * bh; - struct sysv_inode * raw_inode; - unsigned count; - - sb = inode->i_sb; - ino = inode->i_ino; - if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { - printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); - return; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("sysv_free_inode: unable to read inode block on device " - "%s\n", inode->i_sb->s_id); - return; - } - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count < sbi->s_fic_size) { - *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - } - fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); - dirty_sb(sb); - memset(raw_inode, 0, sizeof(struct sysv_inode)); - mark_buffer_dirty(bh); - mutex_unlock(&sbi->s_lock); - brelse(bh); -} - -struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *inode; - sysv_ino_t ino; - unsigned count; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE - }; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count == 
0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { - count = refill_free_cache(sb); - if (count == 0) { - iput(inode); - mutex_unlock(&sbi->s_lock); - return ERR_PTR(-ENOSPC); - } - } - /* Now count > 0. */ - ino = *sv_sb_fic_inode(sb,--count); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); - dirty_sb(sb); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_ino = fs16_to_cpu(sbi, ino); - simple_inode_init_ts(inode); - inode->i_blocks = 0; - memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); - SYSV_I(inode)->i_dir_start_lookup = 0; - insert_inode_hash(inode); - mark_inode_dirty(inode); - - sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ - mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ - /* That's it. */ - mutex_unlock(&sbi->s_lock); - return inode; -} - -unsigned long sysv_count_free_inodes(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - int ino, count, sb_count; - - mutex_lock(&sbi->s_lock); - - sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... 
*/ - count = 0; - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) - count++; - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - } else - raw_inode++; - } - brelse(bh); - if (count != sb_count) - goto Einval; -out: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_inodes: " - "free inode count was %d, correcting to %d\n", - sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); - dirty_sb(sb); - } - goto out; - -Eio: - printk("sysv_count_free_inodes: unable to read inode table\n"); -trust_sb: - count = sb_count; - goto out; -} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c deleted file mode 100644 index 76bc2d5e75a9..000000000000 --- a/fs/sysv/inode.c +++ /dev/null @@ -1,354 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for allocating/freeing inodes and for read/writing - * the superblock. 
- */ - -#include <linux/highuid.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> -#include <linux/writeback.h> -#include <linux/namei.h> -#include <asm/byteorder.h> -#include "sysv.h" - -static int sysv_sync_fs(struct super_block *sb, int wait) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - u32 time = (u32)ktime_get_real_seconds(), old_time; - - mutex_lock(&sbi->s_lock); - - /* - * If we are going to write out the super block, - * then attach current time stamp. - * But if the filesystem was marked clean, keep it clean. - */ - old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); - if (sbi->s_type == FSTYPE_SYSV4) { - if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time)) - *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time); - *sbi->s_sb_time = cpu_to_fs32(sbi, time); - mark_buffer_dirty(sbi->s_bh2); - } - - mutex_unlock(&sbi->s_lock); - - return 0; -} - -static int sysv_remount(struct super_block *sb, int *flags, char *data) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - sync_filesystem(sb); - if (sbi->s_forced_ro) - *flags |= SB_RDONLY; - return 0; -} - -static void sysv_put_super(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (!sb_rdonly(sb)) { - /* XXX ext2 also updates the state here */ - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); - } - - brelse(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - brelse(sbi->s_bh2); - - kfree(sbi); -} - -static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - - buf->f_type = sb->s_magic; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = sbi->s_ndatazones; - buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); - buf->f_files = sbi->s_ninodes; - buf->f_ffree = sysv_count_free_inodes(sb); - buf->f_namelen = SYSV_NAMELEN; - 
buf->f_fsid = u64_to_fsid(id); - return 0; -} - -/* - * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 - */ -static inline void read3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = 0; - to[2] = from[1]; - to[3] = from[2]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - to[3] = 0; - } else { - to[0] = 0; - to[1] = from[0]; - to[2] = from[1]; - to[3] = from[2]; - } -} - -static inline void write3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = from[2]; - to[2] = from[3]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - } else { - to[0] = from[1]; - to[1] = from[2]; - to[2] = from[3]; - } -} - -static const struct inode_operations sysv_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = sysv_getattr, -}; - -void sysv_set_inode(struct inode *inode, dev_t rdev) -{ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &sysv_file_inode_operations; - inode->i_fop = &sysv_file_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &sysv_dir_inode_operations; - inode->i_fop = &sysv_dir_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &sysv_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &sysv_aops; - } else - init_special_inode(inode, inode->i_mode, rdev); -} - -struct inode *sysv_iget(struct super_block *sb, unsigned int ino) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - struct inode *inode; - unsigned int block; - - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of 
range\n", - sb->s_id, ino); - return ERR_PTR(-EIO); - } - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("Major problem: unable to read inode from dev %s\n", - inode->i_sb->s_id); - goto bad_inode; - } - /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ - inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); - i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid)); - i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); - set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); - inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); - inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0); - inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0); - inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); - inode->i_blocks = 0; - - si = SYSV_I(inode); - for (block = 0; block < 10+1+1+1; block++) - read3byte(sbi, &raw_inode->i_data[3*block], - (u8 *)&si->i_data[block]); - brelse(bh); - si->i_dir_start_lookup = 0; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - sysv_set_inode(inode, - old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); - else - sysv_set_inode(inode, 0); - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(-EIO); -} - -static int __sysv_write_inode(struct inode *inode, int wait) -{ - struct super_block * sb = inode->i_sb; - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - unsigned int ino, block; - int err = 0; - - ino = inode->i_ino; - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of range\n", - inode->i_sb->s_id, ino); - return -EIO; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("unable to read i-node block\n"); - return -EIO; - } - - raw_inode->i_mode 
= cpu_to_fs16(sbi, inode->i_mode); - raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode))); - raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); - raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); - raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); - raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode)); - raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode)); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode)); - - si = SYSV_I(inode); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); - for (block = 0; block < 10+1+1+1; block++) - write3byte(sbi, (u8 *)&si->i_data[block], - &raw_inode->i_data[3*block]); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08x]\n", - sb->s_id, ino); - err = -EIO; - } - } - brelse(bh); - return err; -} - -int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); -} - -int sysv_sync_inode(struct inode *inode) -{ - return __sysv_write_inode(inode, 1); -} - -static void sysv_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - if (!inode->i_nlink) { - inode->i_size = 0; - sysv_truncate(inode); - } - invalidate_inode_buffers(inode); - clear_inode(inode); - if (!inode->i_nlink) - sysv_free_inode(inode); -} - -static struct kmem_cache *sysv_inode_cachep; - -static struct inode *sysv_alloc_inode(struct super_block *sb) -{ - struct sysv_inode_info *si; - - si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL); - if (!si) - return NULL; - return &si->vfs_inode; -} - -static void sysv_free_in_core_inode(struct inode *inode) -{ - kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); -} - -static void init_once(void *p) -{ - struct sysv_inode_info *si = (struct sysv_inode_info 
*)p; - - inode_init_once(&si->vfs_inode); -} - -const struct super_operations sysv_sops = { - .alloc_inode = sysv_alloc_inode, - .free_inode = sysv_free_in_core_inode, - .write_inode = sysv_write_inode, - .evict_inode = sysv_evict_inode, - .put_super = sysv_put_super, - .sync_fs = sysv_sync_fs, - .remount_fs = sysv_remount, - .statfs = sysv_statfs, -}; - -int __init sysv_init_icache(void) -{ - sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", - sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, - init_once); - if (!sysv_inode_cachep) - return -ENOMEM; - return 0; -} - -void sysv_destroy_icache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(sysv_inode_cachep); -} diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c deleted file mode 100644 index 451e95f474fa..000000000000 --- a/fs/sysv/itree.c +++ /dev/null @@ -1,511 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/itree.c - * - * Handling of indirect blocks' trees. 
- * AV, Sep--Dec 2000 - */ - -#include <linux/buffer_head.h> -#include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/string.h> -#include "sysv.h" - -enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ - -static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) -{ - mark_buffer_dirty_inode(bh, inode); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); -} - -static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned long indirect_blocks = sbi->s_ind_per_block, - double_blocks = sbi->s_ind_per_block_2; - int n = 0; - - if (block < 0) { - printk("sysv_block_map: block < 0\n"); - } else if (block < DIRECT) { - offsets[n++] = block; - } else if ( (block -= DIRECT) < indirect_blocks) { - offsets[n++] = DIRECT; - offsets[n++] = block; - } else if ((block -= indirect_blocks) < double_blocks) { - offsets[n++] = DIRECT+1; - offsets[n++] = block >> ptrs_bits; - offsets[n++] = block & (indirect_blocks - 1); - } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { - offsets[n++] = DIRECT+2; - offsets[n++] = block >> (ptrs_bits * 2); - offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); - offsets[n++] = block & (indirect_blocks - 1); - } else { - /* nothing */; - } - return n; -} - -static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) -{ - return sbi->s_block_base + fs32_to_cpu(sbi, nr); -} - -typedef struct { - sysv_zone_t *p; - sysv_zone_t key; - struct buffer_head *bh; -} Indirect; - -static DEFINE_RWLOCK(pointers_lock); - -static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -static inline int verify_chain(Indirect *from, Indirect *to) -{ - while (from <= to && from->key == *from->p) - from++; - return (from > to); -} - -static inline sysv_zone_t 
*block_end(struct buffer_head *bh) -{ - return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); -} - -static Indirect *get_branch(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - int block = block_to_cpu(SYSV_SB(sb), p->key); - bh = sb_bread(sb, block); - if (!bh) - goto failure; - read_lock(&pointers_lock); - if (!verify_chain(chain, p)) - goto changed; - add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); - read_unlock(&pointers_lock); - if (!p->key) - goto no_block; - } - return NULL; - -changed: - read_unlock(&pointers_lock); - brelse(bh); - *err = -EAGAIN; - goto no_block; -failure: - *err = -EIO; -no_block: - return p; -} - -static int alloc_branch(struct inode *inode, - int num, - int *offsets, - Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int n = 0; - int i; - - branch[0].key = sysv_new_block(inode->i_sb); - if (branch[0].key) for (n = 1; n < num; n++) { - struct buffer_head *bh; - int parent; - /* Allocate the next block */ - branch[n].key = sysv_new_block(inode->i_sb); - if (!branch[n].key) - break; - /* - * Get buffer_head for parent block, zero it out and set - * the pointer to new one, then send parent to disk. 
- */ - parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); - bh = sb_getblk(inode->i_sb, parent); - if (!bh) { - sysv_free_block(inode->i_sb, branch[n].key); - break; - } - lock_buffer(bh); - memset(bh->b_data, 0, blocksize); - branch[n].bh = bh; - branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; - *branch[n].p = branch[n].key; - set_buffer_uptodate(bh); - unlock_buffer(bh); - dirty_indirect(bh, inode); - } - if (n == num) - return 0; - - /* Allocation failed, free what we already allocated */ - for (i = 1; i < n; i++) - bforget(branch[i].bh); - for (i = 0; i < n; i++) - sysv_free_block(inode->i_sb, branch[i].key); - return -ENOSPC; -} - -static inline int splice_branch(struct inode *inode, - Indirect chain[], - Indirect *where, - int num) -{ - int i; - - /* Verify that place we are splicing to is still there and vacant */ - write_lock(&pointers_lock); - if (!verify_chain(chain, where-1) || *where->p) - goto changed; - *where->p = where->key; - write_unlock(&pointers_lock); - - inode_set_ctime_current(inode); - - /* had we spliced it onto indirect block? 
*/ - if (where->bh) - dirty_indirect(where->bh, inode); - - if (IS_SYNC(inode)) - sysv_sync_inode(inode); - else - mark_inode_dirty(inode); - return 0; - -changed: - write_unlock(&pointers_lock); - for (i = 1; i < num; i++) - bforget(where[i].bh); - for (i = 0; i < num; i++) - sysv_free_block(inode->i_sb, where[i].key); - return -EAGAIN; -} - -static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) -{ - int err = -EIO; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - struct super_block *sb = inode->i_sb; - Indirect *partial; - int left; - int depth = block_to_path(inode, iblock, offsets); - - if (depth == 0) - goto out; - -reread: - partial = get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { -got_it: - map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), - chain[depth-1].key)); - /* Clean up and exit */ - partial = chain+depth-1; /* the whole chain */ - goto cleanup; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { -cleanup: - while (partial > chain) { - brelse(partial->bh); - partial--; - } -out: - return err; - } - - /* - * Indirect block might be removed by truncate while we were - * reading it. Handling of that case (forget what we've got and - * reread) is taken out of the main path. 
- */ - if (err == -EAGAIN) - goto changed; - - left = (chain + depth) - partial; - err = alloc_branch(inode, left, offsets+(partial-chain), partial); - if (err) - goto cleanup; - - if (splice_branch(inode, chain, partial, left) < 0) - goto changed; - - set_buffer_new(bh_result); - goto got_it; - -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; -} - -static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -static Indirect *find_shared(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - sysv_zone_t *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = get_branch(inode, k, offsets, chain, &err); - - write_lock(&pointers_lock); - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) { - write_unlock(&pointers_lock); - goto no_top; - } - for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. 
- */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - *p->p = 0; - } - write_unlock(&pointers_lock); - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) -{ - for ( ; p < q ; p++) { - sysv_zone_t nr = *p; - if (nr) { - *p = 0; - sysv_free_block(inode->i_sb, nr); - mark_inode_dirty(inode); - } - } -} - -static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) -{ - struct buffer_head * bh; - struct super_block *sb = inode->i_sb; - - if (depth--) { - for ( ; p < q ; p++) { - int block; - sysv_zone_t nr = *p; - if (!nr) - continue; - *p = 0; - block = block_to_cpu(SYSV_SB(sb), nr); - bh = sb_bread(sb, block); - if (!bh) - continue; - free_branches(inode, (sysv_zone_t*)bh->b_data, - block_end(bh), depth); - bforget(bh); - sysv_free_block(sb, nr); - mark_inode_dirty(inode); - } - } else - free_data(inode, p, q); -} - -void sysv_truncate (struct inode * inode) -{ - sysv_zone_t *i_data = SYSV_I(inode)->i_data; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - Indirect *partial; - sysv_zone_t nr = 0; - int n; - long iblock; - unsigned blocksize; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - - blocksize = inode->i_sb->s_blocksize; - iblock = (inode->i_size + blocksize-1) - >> inode->i_sb->s_blocksize_bits; - - block_truncate_page(inode->i_mapping, inode->i_size, get_block); - - n = block_to_path(inode, iblock, offsets); - if (n == 0) - return; - - if (n == 1) { - free_data(inode, i_data+offsets[0], i_data + DIRECT); - goto do_indirects; - } - - partial = find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (already detached) */ - if (nr) { - if (partial == chain) - mark_inode_dirty(inode); - else - dirty_indirect(partial->bh, inode); - free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); - } - /* Clear 
the ends of indirect blocks on the shared branch */ - while (partial > chain) { - free_branches(inode, partial->p + 1, block_end(partial->bh), - (chain+n-1) - partial); - dirty_indirect(partial->bh, inode); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees (== subtrees deeper than...) */ - while (n < DEPTH) { - nr = i_data[DIRECT + n - 1]; - if (nr) { - i_data[DIRECT + n - 1] = 0; - mark_inode_dirty(inode); - free_branches(inode, &nr, &nr+1, n); - } - n++; - } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - if (IS_SYNC(inode)) - sysv_sync_inode (inode); - else - mark_inode_dirty(inode); -} - -static unsigned sysv_nblocks(struct super_block *s, loff_t size) -{ - struct sysv_sb_info *sbi = SYSV_SB(s); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned blocks, res, direct = DIRECT, i = DEPTH; - blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; - res = blocks; - while (--i && blocks > direct) { - blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; - res += blocks; - direct = 1; - } - return res; -} - -int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, - struct kstat *stat, u32 request_mask, unsigned int flags) -{ - struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), - stat); - stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); - stat->blksize = s->s_blocksize; - return 0; -} - -static int sysv_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return mpage_writepages(mapping, wbc, get_block); -} - -static int sysv_read_folio(struct file *file, struct folio *folio) -{ - return block_read_full_folio(folio, get_block); -} - -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - return __block_write_begin(folio, pos, len, get_block); -} - -static void sysv_write_failed(struct address_space *mapping, loff_t to) -{ - struct inode *inode = 
mapping->host; - - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - sysv_truncate(inode); - } -} - -static int sysv_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - int ret; - - ret = block_write_begin(mapping, pos, len, foliop, get_block); - if (unlikely(ret)) - sysv_write_failed(mapping, pos + len); - - return ret; -} - -static sector_t sysv_bmap(struct address_space *mapping, sector_t block) -{ - return generic_block_bmap(mapping,block,get_block); -} - -const struct address_space_operations sysv_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = sysv_read_folio, - .writepages = sysv_writepages, - .write_begin = sysv_write_begin, - .write_end = generic_write_end, - .migrate_folio = buffer_migrate_folio, - .bmap = sysv_bmap -}; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c deleted file mode 100644 index fb8bd8437872..000000000000 --- a/fs/sysv/namei.c +++ /dev/null @@ -1,280 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/namei.c - * - * minix/namei.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/namei.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/namei.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. 
Baranowski - */ - -#include <linux/pagemap.h> -#include "sysv.h" - -static int add_nondir(struct dentry *dentry, struct inode *inode) -{ - int err = sysv_add_link(dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } - inode_dec_link_count(inode); - iput(inode); - return err; -} - -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) -{ - struct inode * inode = NULL; - ino_t ino; - - if (dentry->d_name.len > SYSV_NAMELEN) - return ERR_PTR(-ENAMETOOLONG); - ino = sysv_inode_by_name(dentry); - if (ino) - inode = sysv_iget(dir->i_sb, ino); - return d_splice_alias(inode, dentry); -} - -static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) -{ - struct inode * inode; - int err; - - if (!old_valid_dev(rdev)) - return -EINVAL; - - inode = sysv_new_inode(dir, mode); - err = PTR_ERR(inode); - - if (!IS_ERR(inode)) { - sysv_set_inode(inode, rdev); - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); - } - return err; -} - -static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) -{ - return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); -} - -static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, const char *symname) -{ - int err = -ENAMETOOLONG; - int l = strlen(symname)+1; - struct inode * inode; - - if (l > dir->i_sb->s_blocksize) - goto out; - - inode = sysv_new_inode(dir, S_IFLNK|0777); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - sysv_set_inode(inode, 0); - err = page_symlink(inode, symname, l); - if (err) - goto out_fail; - - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; -} - -static int sysv_link(struct dentry * old_dentry, struct inode * dir, - struct dentry * dentry) -{ - struct inode *inode = 
d_inode(old_dentry); - - inode_set_ctime_current(inode); - inode_inc_link_count(inode); - ihold(inode); - - return add_nondir(dentry, inode); -} - -static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - struct inode * inode; - int err; - - inode_inc_link_count(dir); - - inode = sysv_new_inode(dir, S_IFDIR|mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_dir; - - sysv_set_inode(inode, 0); - - inode_inc_link_count(inode); - - err = sysv_make_empty(inode, dir); - if (err) - goto out_fail; - - err = sysv_add_link(dentry, inode); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); - iput(inode); -out_dir: - inode_dec_link_count(dir); - goto out; -} - -static int sysv_unlink(struct inode * dir, struct dentry * dentry) -{ - struct inode * inode = d_inode(dentry); - struct folio *folio; - struct sysv_dir_entry * de; - int err; - - de = sysv_find_entry(dentry, &folio); - if (!de) - return -ENOENT; - - err = sysv_delete_entry(de, folio); - if (!err) { - inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); - inode_dec_link_count(inode); - } - folio_release_kmap(folio, de); - return err; -} - -static int sysv_rmdir(struct inode * dir, struct dentry * dentry) -{ - struct inode *inode = d_inode(dentry); - int err = -ENOTEMPTY; - - if (sysv_empty_dir(inode)) { - err = sysv_unlink(dir, dentry); - if (!err) { - inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); - } - } - return err; -} - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. 
- */ -static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) -{ - struct inode * old_inode = d_inode(old_dentry); - struct inode * new_inode = d_inode(new_dentry); - struct folio *dir_folio; - struct sysv_dir_entry * dir_de = NULL; - struct folio *old_folio; - struct sysv_dir_entry * old_de; - int err = -ENOENT; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - old_de = sysv_find_entry(old_dentry, &old_folio); - if (!old_de) - goto out; - - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - dir_de = sysv_dotdot(old_inode, &dir_folio); - if (!dir_de) - goto out_old; - } - - if (new_inode) { - struct folio *new_folio; - struct sysv_dir_entry * new_de; - - err = -ENOTEMPTY; - if (dir_de && !sysv_empty_dir(new_inode)) - goto out_dir; - - err = -ENOENT; - new_de = sysv_find_entry(new_dentry, &new_folio); - if (!new_de) - goto out_dir; - err = sysv_set_link(new_de, new_folio, old_inode); - folio_release_kmap(new_folio, new_de); - if (err) - goto out_dir; - inode_set_ctime_current(new_inode); - if (dir_de) - drop_nlink(new_inode); - inode_dec_link_count(new_inode); - } else { - err = sysv_add_link(new_dentry, old_inode); - if (err) - goto out_dir; - if (dir_de) - inode_inc_link_count(new_dir); - } - - err = sysv_delete_entry(old_de, old_folio); - if (err) - goto out_dir; - - mark_inode_dirty(old_inode); - - if (dir_de) { - err = sysv_set_link(dir_de, dir_folio, new_dir); - if (!err) - inode_dec_link_count(old_dir); - } - -out_dir: - if (dir_de) - folio_release_kmap(dir_folio, dir_de); -out_old: - folio_release_kmap(old_folio, old_de); -out: - return err; -} - -/* - * directories can handle most operations... 
- */ -const struct inode_operations sysv_dir_inode_operations = { - .create = sysv_create, - .lookup = sysv_lookup, - .link = sysv_link, - .unlink = sysv_unlink, - .symlink = sysv_symlink, - .mkdir = sysv_mkdir, - .rmdir = sysv_rmdir, - .mknod = sysv_mknod, - .rename = sysv_rename, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/super.c b/fs/sysv/super.c deleted file mode 100644 index 5c0d07ddbda2..000000000000 --- a/fs/sysv/super.c +++ /dev/null @@ -1,595 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for read/parsing the superblock. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include "sysv.h" - -/* - * The following functions try to recognize specific filesystems. - * - * We recognize: - * - Xenix FS by its magic number. - * - SystemV FS by its magic number. - * - Coherent FS by its funny fname/fpack field. - * - SCO AFS by s_nfree == 0xffff - * - V7 FS has no distinguishing features. - * - * We discriminate among SystemV4 and SystemV2 FS by the assumption that - * the time stamp is not < 01-01-1980. 
- */ - -enum { - JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 -}; - -static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - struct xenix_super_block * sbd1; - struct xenix_super_block * sbd2; - - if (bh1 != bh2) - sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; - else { - /* block size = 512, so bh1 != bh2 */ - sbd1 = (struct xenix_super_block *) bh1->b_data; - sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); - } - - *max_links = XENIX_LINK_MAX; - sbi->s_fic_size = XENIX_NICINOD; - sbi->s_flc_size = XENIX_NICFREE; - sbi->s_sbd1 = (char *)sbd1; - sbi->s_sbd2 = (char *)sbd2; - sbi->s_sb_fic_count = &sbd1->s_ninode; - sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd2->s_tinode; - sbi->s_bcache_count = &sbd1->s_nfree; - sbi->s_bcache = &sbd1->s_free[0]; - sbi->s_free_blocks = &sbd2->s_tfree; - sbi->s_sb_time = &sbd2->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); -} - -static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv4_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv4_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, 
sbd->s_fsize); -} - -static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv2_super_block *sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv2_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct coh_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - - sbd = (struct coh_super_block *) bh1->b_data; - - *max_links = COH_LINK_MAX; - sbi->s_fic_size = COH_NICINOD; - sbi->s_flc_size = COH_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh2 = sbi->s_bh2; - struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; - - *max_links = V7_LINK_MAX; - sbi->s_fic_size = V7_NICINOD; - sbi->s_flc_size = V7_NICFREE; - sbi->s_sbd1 = (char *)sbd; - 
sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - switch (fs32_to_cpu(sbi, sbd->s_type)) { - case 1: - sbi->s_type = FSTYPE_XENIX; - return 1; - case 2: - sbi->s_type = FSTYPE_XENIX; - return 2; - default: - return 0; - } -} - -static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct super_block *sb = sbi->s_sb; - /* All relevant fields are at the same offsets in R2 and R4 */ - struct sysv4_super_block * sbd; - u32 type; - - sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - - type = fs32_to_cpu(sbi, sbd->s_type); - - if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { - sbi->s_type = FSTYPE_AFS; - sbi->s_forced_ro = 1; - if (!sb_rdonly(sb)) { - printk("SysV FS: SCO EAFS on %s detected, " - "forcing read-only mode.\n", - sb->s_id); - } - return type; - } - - if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { - /* this is likely to happen on SystemV2 FS */ - if (type > 3 || type < 1) - return 0; - sbi->s_type = FSTYPE_SYSV2; - return type; - } - if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) - return 0; - - /* 
On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, - 0x20 or 0x30 indicates that symbolic links and the 14-character - filename limit is gone. Due to lack of information about this - feature read-only mode seems to be a reasonable approach... -KGB */ - - if (type >= 0x10) { - printk("SysV FS: can't handle long file names on %s, " - "forcing read-only mode.\n", sb->s_id); - sbi->s_forced_ro = 1; - } - - sbi->s_type = FSTYPE_SYSV4; - return type >= 0x10 ? type >> 4 : type; -} - -static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct coh_super_block * sbd; - - sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); - if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) - || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) - return 0; - sbi->s_bytesex = BYTESEX_PDP; - sbi->s_type = FSTYPE_COH; - return 1; -} - -static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - int size = detect_sysv(sbi, bh); - - return size>2 ? 
0 : size; -} - -static struct { - int block; - int (*test)(struct sysv_sb_info *, struct buffer_head *); -} flavours[] = { - {1, detect_xenix}, - {0, detect_sysv}, - {0, detect_coherent}, - {9, detect_sysv_odd}, - {15,detect_sysv_odd}, - {18,detect_sysv}, -}; - -static char *flavour_names[] = { - [FSTYPE_XENIX] = "Xenix", - [FSTYPE_SYSV4] = "SystemV", - [FSTYPE_SYSV2] = "SystemV Release 2", - [FSTYPE_COH] = "Coherent", - [FSTYPE_V7] = "V7", - [FSTYPE_AFS] = "AFS", -}; - -static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { - [FSTYPE_XENIX] = detected_xenix, - [FSTYPE_SYSV4] = detected_sysv4, - [FSTYPE_SYSV2] = detected_sysv2, - [FSTYPE_COH] = detected_coherent, - [FSTYPE_V7] = detected_v7, - [FSTYPE_AFS] = detected_sysv4, -}; - -static int complete_read_super(struct super_block *sb, int silent, int size) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *root_inode; - char *found = flavour_names[sbi->s_type]; - u_char n_bits = size+8; - int bsize = 1 << n_bits; - int bsize_4 = bsize >> 2; - - sbi->s_firstinodezone = 2; - - flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - if (sbi->s_firstdatazone < sbi->s_firstinodezone) - return 0; - - sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; - sbi->s_inodes_per_block = bsize >> 6; - sbi->s_inodes_per_block_1 = (bsize >> 6)-1; - sbi->s_inodes_per_block_bits = n_bits-6; - sbi->s_ind_per_block = bsize_4; - sbi->s_ind_per_block_2 = bsize_4*bsize_4; - sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); - sbi->s_ind_per_block_bits = n_bits-2; - - sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) - << sbi->s_inodes_per_block_bits; - - if (!silent) - printk("VFS: Found a %s FS (block size = %ld) on device %s\n", - found, sb->s_blocksize, sb->s_id); - - sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; - /* set up enough so that it can read an inode */ - sb->s_op = &sysv_sops; - if (sbi->s_forced_ro) - sb->s_flags |= SB_RDONLY; - root_inode = sysv_iget(sb, 
SYSV_ROOT_INO); - if (IS_ERR(root_inode)) { - printk("SysV FS: get root inode failed\n"); - return 0; - } - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) { - printk("SysV FS: get root dentry failed\n"); - return 0; - } - return 1; -} - -static int sysv_fill_super(struct super_block *sb, void *data, int silent) -{ - struct buffer_head *bh1, *bh = NULL; - struct sysv_sb_info *sbi; - unsigned long blocknr; - int size = 0, i; - - BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); - BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); - BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - sb_set_blocksize(sb, BLOCK_SIZE); - - for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { - brelse(bh); - bh = sb_bread(sb, flavours[i].block); - if (!bh) - continue; - size = flavours[i].test(SYSV_SB(sb), bh); - } - - if (!size) - goto Eunknown; - - switch (size) { - case 1: - blocknr = bh->b_blocknr << 1; - brelse(bh); - sb_set_blocksize(sb, 512); - bh1 = sb_bread(sb, blocknr); - bh = sb_bread(sb, blocknr + 1); - break; - case 2: - bh1 = bh; - break; - case 3: - blocknr = bh->b_blocknr >> 1; - brelse(bh); - sb_set_blocksize(sb, 2048); - bh1 = bh = sb_bread(sb, blocknr); - break; - default: - goto Ebadsize; - } - - if (bh && bh1) { - sbi->s_bh1 = bh1; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, size)) - return 0; - } - - brelse(bh1); - brelse(bh); - sb_set_blocksize(sb, BLOCK_SIZE); - printk("oldfs: cannot read superblock\n"); -failed: - kfree(sbi); - return -EINVAL; - -Eunknown: - brelse(bh); - if (!silent) - printk("VFS: unable to find oldfs superblock on device %s\n", - sb->s_id); - goto failed; -Ebadsize: 
- brelse(bh); - if (!silent) - printk("VFS: oldfs: unsupported block size (%dKb)\n", - 1<<(size-2)); - goto failed; -} - -static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) -{ - struct v7_super_block *v7sb; - struct sysv_inode *v7i; - struct buffer_head *bh2; - struct sysv_sb_info *sbi; - - sbi = sb->s_fs_info; - - /* plausibility check on superblock */ - v7sb = (struct v7_super_block *) bh->b_data; - if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || - fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || - fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) - return 0; - - /* plausibility check on root inode: it is a directory, - with a nonzero size that is a multiple of 16 */ - bh2 = sb_bread(sb, 2); - if (bh2 == NULL) - return 0; - - v7i = (struct sysv_inode *)(bh2->b_data + 64); - if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || - (fs32_to_cpu(sbi, v7i->i_size) == 0) || - (fs32_to_cpu(sbi, v7i->i_size) & 017) || - (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * - sizeof(struct sysv_dir_entry))) { - brelse(bh2); - return 0; - } - - brelse(bh2); - return 1; -} - -static int v7_fill_super(struct super_block *sb, void *data, int silent) -{ - struct sysv_sb_info *sbi; - struct buffer_head *bh; - - BUILD_BUG_ON(sizeof(struct v7_super_block) != 440); - BUILD_BUG_ON(sizeof(struct sysv_inode) != 64); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - sbi->s_type = FSTYPE_V7; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - - sb_set_blocksize(sb, 512); - - if ((bh = sb_bread(sb, 1)) == NULL) { - if (!silent) - printk("VFS: unable to read V7 FS superblock on " - "device %s.\n", sb->s_id); - goto failed; - } - - /* Try PDP-11 UNIX */ - sbi->s_bytesex = BYTESEX_PDP; - if (v7_sanity_check(sb, bh)) - goto detected; - - /* Try PC/IX, v7/x86 */ - sbi->s_bytesex = BYTESEX_LE; - if (v7_sanity_check(sb, bh)) - goto 
detected; - - goto failed; - -detected: - sbi->s_bh1 = bh; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, 1)) - return 0; - -failed: - printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", - sb->s_id); - brelse(bh); - kfree(sbi); - return -EINVAL; -} - -/* Every kernel module contains stuff like this. */ - -static struct dentry *sysv_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); -} - -static struct dentry *v7_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); -} - -static struct file_system_type sysv_fs_type = { - .owner = THIS_MODULE, - .name = "sysv", - .mount = sysv_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("sysv"); - -static struct file_system_type v7_fs_type = { - .owner = THIS_MODULE, - .name = "v7", - .mount = v7_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("v7"); -MODULE_ALIAS("v7"); - -static int __init init_sysv_fs(void) -{ - int error; - - error = sysv_init_icache(); - if (error) - goto out; - error = register_filesystem(&sysv_fs_type); - if (error) - goto destroy_icache; - error = register_filesystem(&v7_fs_type); - if (error) - goto unregister; - return 0; - -unregister: - unregister_filesystem(&sysv_fs_type); -destroy_icache: - sysv_destroy_icache(); -out: - return error; -} - -static void __exit exit_sysv_fs(void) -{ - unregister_filesystem(&sysv_fs_type); - unregister_filesystem(&v7_fs_type); - sysv_destroy_icache(); -} - -module_init(init_sysv_fs) -module_exit(exit_sysv_fs) -MODULE_DESCRIPTION("SystemV Filesystem"); -MODULE_LICENSE("GPL"); diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h deleted file mode 100644 index 0a48b2e7edb1..000000000000 --- a/fs/sysv/sysv.h +++ /dev/null @@ -1,245 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0 */ -#ifndef _SYSV_H -#define _SYSV_H - -#include <linux/buffer_head.h> - -typedef __u16 __bitwise __fs16; -typedef __u32 __bitwise __fs32; - -#include <linux/sysv_fs.h> - -/* - * SystemV/V7/Coherent super-block data in memory - * - * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified - * while the system is running). This is in contrast to the Minix and Berkeley - * filesystems (where the superblock is never modified). This affects the - * sync() operation: we must keep the superblock in a disk buffer and use this - * one as our "working copy". - */ - -struct sysv_sb_info { - struct super_block *s_sb; /* VFS superblock */ - int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ - char s_bytesex; /* bytesex (le/be/pdp) */ - unsigned int s_inodes_per_block; /* number of inodes per block */ - unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ - unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ - unsigned int s_ind_per_block; /* number of indirections per block */ - unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ - unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ - unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ - unsigned int s_block_base; /* physical block number of block 0 */ - unsigned short s_fic_size; /* free inode cache size, NICINOD */ - unsigned short s_flc_size; /* free block list chunk size, NICFREE */ - /* The superblock is kept in one or two disk buffers: */ - struct buffer_head *s_bh1; - struct buffer_head *s_bh2; - /* These are pointers into the disk buffer, to compensate for - different superblock layout. 
*/ - char * s_sbd1; /* entire superblock data, for part 1 */ - char * s_sbd2; /* entire superblock data, for part 2 */ - __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ - sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ - __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ - __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ - sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ - __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ - __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ - __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ - /* We keep those superblock entities that don't change here; - this saves us an indirection and perhaps a conversion. */ - u32 s_firstinodezone; /* index of first inode zone */ - u32 s_firstdatazone; /* same as s_sbd->s_isize */ - u32 s_ninodes; /* total number of inodes */ - u32 s_ndatazones; /* total number of data zones */ - u32 s_nzones; /* same as s_sbd->s_fsize */ - u16 s_namelen; /* max length of dir entry */ - int s_forced_ro; - struct mutex s_lock; -}; - -/* - * SystemV/V7/Coherent FS inode data in memory - */ -struct sysv_inode_info { - __fs32 i_data[13]; - u32 i_dir_start_lookup; - struct inode vfs_inode; -}; - - -static inline struct sysv_inode_info *SYSV_I(struct inode *inode) -{ - return container_of(inode, struct sysv_inode_info, vfs_inode); -} - -static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - - -/* identify the FS in memory */ -enum { - FSTYPE_NONE = 0, - FSTYPE_XENIX, - FSTYPE_SYSV4, - FSTYPE_SYSV2, - FSTYPE_COH, - FSTYPE_V7, - FSTYPE_AFS, - FSTYPE_END, -}; - -#define SYSV_MAGIC_BASE 0x012FF7B3 - -#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) -#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) -#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) -#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) - - -/* Admissible values for i_nlink: 0.._LINK_MAX */ -enum { - XENIX_LINK_MAX = 
126, /* ?? */ - SYSV_LINK_MAX = 126, /* 127? 251? */ - V7_LINK_MAX = 126, /* ?? */ - COH_LINK_MAX = 10000, -}; - - -static inline void dirty_sb(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); -} - - -/* ialloc.c */ -extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, - struct buffer_head **); -extern struct inode * sysv_new_inode(const struct inode *, umode_t); -extern void sysv_free_inode(struct inode *); -extern unsigned long sysv_count_free_inodes(struct super_block *); - -/* balloc.c */ -extern sysv_zone_t sysv_new_block(struct super_block *); -extern void sysv_free_block(struct super_block *, sysv_zone_t); -extern unsigned long sysv_count_free_blocks(struct super_block *); - -/* itree.c */ -void sysv_truncate(struct inode *); -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); - -/* inode.c */ -extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); -extern int sysv_sync_inode(struct inode *); -extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int sysv_init_icache(void); -extern void sysv_destroy_icache(void); - - -/* dir.c */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); -int sysv_add_link(struct dentry *, struct inode *); -int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); -int sysv_make_empty(struct inode *, struct inode *); -int sysv_empty_dir(struct inode *); -int sysv_set_link(struct sysv_dir_entry *, struct folio *, - struct inode *); -struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); -ino_t sysv_inode_by_name(struct dentry *); - - -extern const struct inode_operations sysv_file_inode_operations; -extern const struct inode_operations 
sysv_dir_inode_operations; -extern const struct file_operations sysv_file_operations; -extern const struct file_operations sysv_dir_operations; -extern const struct address_space_operations sysv_aops; -extern const struct super_operations sysv_sops; - - -enum { - BYTESEX_LE, - BYTESEX_PDP, - BYTESEX_BE, -}; - -static inline u32 PDP_swab(u32 x) -{ -#ifdef __LITTLE_ENDIAN - return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); -#else -#ifdef __BIG_ENDIAN - return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); -#else -#error BYTESEX -#endif -#endif -} - -static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return PDP_swab((__force __u32)n); - else if (sbi->s_bytesex == BYTESEX_LE) - return le32_to_cpu((__force __le32)n); - else - return be32_to_cpu((__force __be32)n); -} - -static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return (__force __fs32)PDP_swab(n); - else if (sbi->s_bytesex == BYTESEX_LE) - return (__force __fs32)cpu_to_le32(n); - else - return (__force __fs32)cpu_to_be32(n); -} - -static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); - else if (sbi->s_bytesex == BYTESEX_LE) - le32_add_cpu((__le32 *)n, d); - else - be32_add_cpu((__be32 *)n, d); - return *n; -} - -static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return le16_to_cpu((__force __le16)n); - else - return be16_to_cpu((__force __be16)n); -} - -static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return (__force __fs16)cpu_to_le16(n); - else - return (__force __fs16)cpu_to_be16(n); -} - -static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) -{ - if (sbi->s_bytesex != BYTESEX_BE) - le16_add_cpu((__le16 *)n, d); - else - be16_add_cpu((__be16 
*)n, d); - return *n; -} - -#endif /* _SYSV_H */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 9f7eb451a60f..c68f28d9c426 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { - hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); - ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { @@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else - hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); @@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } - file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; fd_install(ufd, file); return ufd; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 53214499e384..cb1af30b49f5 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, - struct inode *inode, struct dentry *dentry, - umode_t mode) +static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap, + struct inode *inode, struct dentry *dentry, + umode_t mode) { struct tracefs_inode *ti; char *name; @@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, name = get_dname(dentry); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* * This is a new 
directory that does not take the default of @@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, kfree(name); - return ret; + return ERR_PTR(ret); } static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index fda82f3e16e8..3c3d3ad4fa6c 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1002,8 +1002,8 @@ out_fname: return err; } -static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = ubifs_budget_space(c, &req); if (err) - return err; + return ERR_PTR(err); err = ubifs_prepare_create(dir, dentry, &nm); if (err) @@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, ubifs_release_budget(c, &req); d_instantiate(dentry, inode); fscrypt_free_filename(&nm); - return 0; + return NULL; out_cancel: dir->i_size -= sz_change; @@ -1074,7 +1074,7 @@ out_fname: fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); - return err; + return ERR_PTR(err); } static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01d8eb170382..a79f229df475 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->c = c; wbuf->next_ino = 0; - hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wbuf->timer.function = wbuf_timer_callback_nolock; + hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2cb49b6b0716..5f2e9a892bff 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ 
-419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_iter iter; @@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inode = udf_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } set_nlink(inode, 2); iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -456,7 +456,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - return 0; + return NULL; } static int empty_dir(struct inode *dir) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 38a024c8cccd..5b3c85c93242 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, - struct dentry * dentry, umode_t mode) +static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, + struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, 
struct inode * dir, goto out_fail; d_instantiate_new(dentry, inode); - return 0; + return NULL; out_fail: inode_dec_link_count(inode); @@ -202,7 +202,7 @@ out_fail: discard_new_inode(inode); out_dir: inode_dec_link_count(dir); - return err; + return ERR_PTR(err); } static int ufs_unlink(struct inode *dir, struct dentry *dentry) diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index da786a687fdc..4ad2c36550f1 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -10,6 +10,7 @@ config UNICODE be a separate loadable module that gets requested only when a file system actually use it. -config UNICODE_NORMALIZATION_SELFTEST +config UNICODE_NORMALIZATION_KUNIT_TEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE && KUNIT + default KUNIT_ALL_TESTS diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index e309afe2b2bb..d95be7fb9f6b 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig new file mode 100644 index 000000000000..62dd5c171f9c --- /dev/null +++ b/fs/unicode/tests/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_UNICODE=y +CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c index 5ddaf27b21a6..5063e8138aec 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/tests/utf8_kunit.c @@ -1,34 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel module for testing utf-8 support. + * KUnit tests for utf-8 support. * * Copyright 2017 Collabora Ltd. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/printk.h> #include <linux/unicode.h> -#include <linux/dcache.h> - -#include "utf8n.h" - -static unsigned int failed_tests; -static unsigned int total_tests; - -#define _test(cond, func, line, fmt, ...) do { \ - total_tests++; \ - if (!cond) { \ - failed_tests++; \ - pr_err("test %s:%d Failed: %s%s", \ - func, line, #cond, (fmt?":":".")); \ - if (fmt) \ - pr_err(fmt, ##__VA_ARGS__); \ - } \ - } while (0) -#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define test(cond) _test(cond, __func__, __LINE__, "") +#include <kunit/test.h> + +#include "../utf8n.h" static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ @@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, return utf8ncursor(u8c, um, n, s, (unsigned int)-1); } -static void check_utf8_nfdi(struct unicode_map *um) +static void check_utf8_nfdi(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); int nlen = strlen(nfdi_test_data[i].dec); int j = 0; unsigned char c; + int ret; + + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), + nlen); - test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == - nlen)); - if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdi_test_data[i].dec[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdi_test_data[i].dec[j]); + KUNIT_EXPECT_EQ_MSG(test, c, 
nfdi_test_data[i].dec[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdi_test_data[i].dec[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_nfdicf(struct unicode_map *um) +static void check_utf8_nfdicf(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); int nlen = strlen(nfdicf_test_data[i].ncf); int j = 0; + int ret; unsigned char c; - test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == - nlen)); - test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == - nlen)); + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), + nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), + nlen); - if (utf8cursor(&u8c, um, UTF8_NFDICF, - nfdicf_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdicf_test_data[i].ncf[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdicf_test_data[i].ncf[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdicf_test_data[i].ncf[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_comparisons(struct unicode_map *table) +static void check_utf8_comparisons(struct kunit *test) { int i; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdi_test_data[i].dec, .len = sizeof(nfdi_test_data[i].dec)}; - test_f(!utf8_strncmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncmp 
returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { @@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, .len = sizeof(nfdicf_test_data[i].ncf)}; - test_f(!utf8_strncasecmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncasecmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } } -static void check_supported_versions(struct unicode_map *um) +static void check_supported_versions(struct kunit *test) { + struct unicode_map *um = test->priv; /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(um, UTF8_LATEST)); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. 
*/ - test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } -static int __init init_test_ucd(void) +static struct kunit_case unicode_normalization_test_cases[] = { + KUNIT_CASE(check_supported_versions), + KUNIT_CASE(check_utf8_comparisons), + KUNIT_CASE(check_utf8_nfdicf), + KUNIT_CASE(check_utf8_nfdi), + {} +}; + +static int init_test_ucd(struct kunit *test) { - struct unicode_map *um; + struct unicode_map *um = utf8_load(UTF8_LATEST); - failed_tests = 0; - total_tests = 0; + test->priv = um; - um = utf8_load(UTF8_LATEST); - if (IS_ERR(um)) { - pr_err("%s: Unable to load utf8 table.\n", __func__); - return PTR_ERR(um); - } + KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, + "%s: Unable to load utf8 table.\n", __func__); - check_supported_versions(um); - check_utf8_nfdi(um); - check_utf8_nfdicf(um); - check_utf8_comparisons(um); - - if (!failed_tests) - pr_info("All %u tests passed\n", total_tests); - else - pr_err("%u out of %u tests failed\n", failed_tests, - total_tests); - utf8_unload(um); return 0; } -static void __exit exit_test_ucd(void) +static void exit_test_ucd(struct kunit *test) { + utf8_unload(test->priv); } -module_init(init_test_ucd); -module_exit(exit_test_ucd); +static struct kunit_suite unicode_normalization_test_suite = { + .name = "unicode_normalization", + .test_cases = unicode_normalization_test_cases, + .init = init_test_ucd, + .exit = exit_test_ucd, +}; + +kunit_test_suite(unicode_normalization_test_suite); + MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); -MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); +MODULE_DESCRIPTION("KUnit tests for 
utf-8 support."); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 768f8ab448b8..7b998c99c88d 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -586,7 +586,7 @@ ccc_mismatch: } } -#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index a859ac9b74ba..770e29ec3557 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, - struct inode *parent, struct dentry *dentry, - umode_t mode) +static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap, + struct inode *parent, struct dentry *dentry, + umode_t mode) { - return vboxsf_dir_create(parent, dentry, mode, true, true, NULL); + return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL)); } static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry, diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index e1036e535352..40569d3527a7 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -4,13 +4,9 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO select CRYPTO_HASH_INFO - # SHA-256 is implied as it's intended to be the default hash algorithm. + # SHA-256 is selected as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. - # Note that CRYPTO_SHA256 denotes the generic C implementation, but - # some architectures provided optimized implementations of the same - # algorithm that may be used instead. In this case, CRYPTO_SHA256 may - # be omitted even if SHA-256 is being used. 
- imply CRYPTO_SHA256 + select CRYPTO_SHA256 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7afa51e41427..5bf501cf8271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ xfs_rtgroup.o \ + xfs_zones.o \ ) # highlevel code @@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o \ + xfs_zone_gc.o \ + xfs_zone_info.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index b59cb461e096..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -301,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ef19f1469ec..63255820b58a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,13 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -171,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". 
*/ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -2572,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. 
- */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. 
- */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); - - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -4039,144 +3897,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. 
Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - uint64_t fdblocks; - int error; - xfs_fileoff_t aoff; - bool use_cowextszhint = - whichfork == XFS_COW_FORK && !prealloc; - -retry: - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - aoff = off; - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* - * If we're targetting the COW fork but aren't creating a speculative - * posteof preallocation, try to expand the reservation to align with - * the COW extent size hint if there's sufficient free space. - * - * Unlike the data fork, the CoW cancellation functions will free all - * the reservations at inactivation, so we don't require that every - * delalloc reservation have a dirty pagecache. - */ - if (use_cowextszhint) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. 
- */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - goto out; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - fdblocks = indlen; - if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); - if (error) - goto out_unreserve_quota; - } else { - fdblocks += alen; - } - - error = xfs_dec_fdblocks(mp, fdblocks, false); - if (error) - goto out_unreserve_frextents; - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip, alen, indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. 
- */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_frextents: - if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); -out: - if (error == -ENOSPC || error == -EDQUOT) { - trace_xfs_delalloc_enospc(ip, off, len); - - if (prealloc || use_cowextszhint) { - /* retry without any preallocation */ - use_cowextszhint = false; - prealloc = 0; - goto retry; - } - } - return error; -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay( int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (isrt) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); - else + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); @@ -5670,7 +5399,8 @@ __xfs_bunmapi( delete: if (wasdel) { - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b721d935994..b4d9c6e0f3f9 
100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b1007fb661ba..9566a7623365 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -178,9 +178,10 @@ typedef struct xfs_sb { xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ - uint8_t sb_rgblklog; /* rt group number shift */ uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved 
(zoned) RT blocks */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -270,9 +271,10 @@ struct xfs_dsb { __be64 sb_metadirino; /* metadata directory tree root */ __be32 sb_rgcount; /* # of realtime groups */ __be32 sb_rgextents; /* size of rtgroup in rtx */ - __u8 sb_rgblklog; /* rt group number shift */ __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ /* * The size of this structure must be padded to 64 bit alignment. @@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ XFS_SB_FEAT_INCOMPAT_PARENT | \ - XFS_SB_FEAT_INCOMPAT_METADIR) + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -952,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 
2c3171262b44..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -189,7 +189,9 @@ struct xfs_fsop_geom { uint32_t checked; /* o: checked fs & rt metadata */ __u32 rgextents; /* rt extents in a realtime group */ __u32 rgcount; /* number of realtime groups */ - __u64 reserved[16]; /* reserved space */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. @@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 242b05627c7a..4423932a2313 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -19,10 +19,23 @@ struct xfs_group { #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Track freed but not yet committed extents. - */ - struct xfs_extent_busy_tree *xg_busy_extents; + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. 
+ * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; /* * Bitsets of per-ag metadata that have been checked and/or are sick. @@ -107,9 +120,15 @@ xfs_gbno_to_daddr( xfs_agblock_t gbno) { struct xfs_mount *mp = xg->xg_mount; - uint32_t blocks = mp->m_groups[xg->xg_type].blocks; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); } static inline uint32_t diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f3a840a425f5..0c47b5c6ca7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -364,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -1927,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. 
*/ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -137,7 +137,7 @@ xfs_imap_to_bp( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); @@ -252,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -349,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -752,11 +756,18 @@ xfs_dinode_verify( !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = 
xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index deb0b7c00a1f..48fe49a5f050 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -322,6 +322,7 @@ xfs_inode_init( if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a472ac2e45d0..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -475,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 2f5f554a36d4..225923e463c4 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -21,6 +21,9 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" static const struct { enum xfs_metafile_type mtype; @@ -74,12 +77,11 @@ xfs_metafile_clear_iflag( } /* - * Is the amount of space that could be allocated towards a given metadata - * file at or beneath a certain threshold? + * Is the metafile reservations at or beneath a certain threshold? 
*/ static inline bool xfs_metafile_resv_can_cover( - struct xfs_inode *ip, + struct xfs_mount *mp, int64_t rhs) { /* @@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover( * global free block count. Take care of the first case to avoid * touching the per-cpu counter. */ - if (ip->i_delayed_blks >= rhs) + if (mp->m_metafile_resv_avail >= rhs) return true; /* * There aren't enough blocks left in the inode's reservation, but it * isn't critical unless there also isn't enough free space. */ - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, - rhs - ip->i_delayed_blks, 2048) >= 0; + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; } /* - * Is this metadata file critically low on blocks? For now we'll define that - * as the number of blocks we can get our hands on being less than 10% of what - * we reserved or less than some arbitrary number (maximum btree height). + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). 
*/ bool xfs_metafile_resv_critical( - struct xfs_inode *ip) + struct xfs_mount *mp) { - uint64_t asked_low_water; + ASSERT(xfs_has_metadir(mp)); - if (!ip) - return false; - - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_critical(ip, 0); + trace_xfs_metafile_resv_critical(mp, 0); - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) return true; - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, ip->i_mount, - XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ @@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space( struct xfs_inode *ip, struct xfs_alloc_arg *args) { + struct xfs_mount *mp = ip->i_mount; int64_t len = args->len; ASSERT(xfs_is_metadir_inode(ip)); ASSERT(args->resv == XFS_AG_RESV_METAFILE); - trace_xfs_metafile_resv_alloc_space(ip, args->len); + trace_xfs_metafile_resv_alloc_space(mp, args->len); /* * Allocate the blocks from the metadata inode's block reservation * and update the ondisk sb counter. 
*/ - if (ip->i_delayed_blks > 0) { + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { int64_t from_resv; - from_resv = min_t(int64_t, len, ip->i_delayed_blks); - ip->i_delayed_blks -= from_resv; + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; xfs_mod_delalloc(ip, 0, -from_resv); xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -from_resv); @@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space( xfs_trans_mod_sb(args->tp, field, -len); } + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + ip->i_nblocks += args->len; xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); } @@ -186,26 +188,33 @@ xfs_metafile_resv_free_space( struct xfs_trans *tp, xfs_filblks_t len) { + struct xfs_mount *mp = ip->i_mount; int64_t to_resv; ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free_space(ip, len); + + trace_xfs_metafile_resv_free_space(mp, len); ip->i_nblocks -= len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + /* * Add the freed blocks back into the inode's delalloc reservation * until it reaches the maximum size. Update the ondisk fdblocks only. */ - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); if (to_resv > 0) { to_resv = min_t(int64_t, to_resv, len); - ip->i_delayed_blks += to_resv; + mp->m_metafile_resv_avail += to_resv; xfs_mod_delalloc(ip, 0, to_resv); xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); len -= to_resv; } + mutex_unlock(&mp->m_metafile_resv_lock); /* * Everything else goes back to the filesystem, so update the in-core @@ -215,61 +224,99 @@ xfs_metafile_resv_free_space( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); } -/* Release a metadata file's space reservation. 
*/ +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ void xfs_metafile_resv_free( - struct xfs_inode *ip) + struct xfs_mount *mp) { - /* Non-btree metadata inodes don't need space reservations. */ - if (!ip || !ip->i_meta_resv_asked) + if (!xfs_has_metadir(mp)) return; - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free(ip, 0); + trace_xfs_metafile_resv_free(mp, 0); - if (ip->i_delayed_blks) { - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); - ip->i_delayed_blks = 0; - } - ip->i_meta_resv_asked = 0; + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); } -/* Set up a metadata file's space reservation. */ +/* Set up a metafile space reservation. */ int xfs_metafile_resv_init( - struct xfs_inode *ip, - xfs_filblks_t ask) + struct xfs_mount *mp) { + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; xfs_filblks_t hidden_space; - xfs_filblks_t used; - int error; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; - if (!ip || ip->i_meta_resv_asked > 0) + if (!xfs_has_metadir(mp)) return 0; - ASSERT(xfs_is_metadir_inode(ip)); + /* + * Free any previous reservation to have a clean slate. + */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. 
+ */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; /* - * Space taken by all other metadata btrees are accounted on-disk as + * Space taken by the per-AG metadata btrees are accounted on-disk as * used space. We therefore only hide the space that is reserved but * not used by the trees. */ - used = ip->i_nblocks; - if (used > ask) - ask = used; - hidden_space = ask - used; + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); - return error; + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; } - xfs_mod_delalloc(ip, 0, hidden_space); - ip->i_delayed_blks = hidden_space; - ip->i_meta_resv_asked = ask; + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); - trace_xfs_metafile_resv_init(ip, ask); - return 0; +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; } diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index 95af4b52e5a7..ae6f9e779b98 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); /* Space reservations for metadata inodes. 
*/ struct xfs_alloc_arg; -bool xfs_metafile_resv_critical(struct xfs_inode *ip); +bool xfs_metafile_resv_critical(struct xfs_mount *mp); void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, struct xfs_alloc_arg *args); void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, xfs_filblks_t len); -void xfs_metafile_resv_free(struct xfs_inode *ip); -int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); /* Code specific to kernel/userspace; must be provided externally. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index a85ecddaa48e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void) 16299260424LL); /* superblock field checks we got from xfs/122 */ - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); XFS_CHECK_SB_OFFSET(sb_magicnum, 0); XFS_CHECK_SB_OFFSET(sb_blocksize, 4); XFS_CHECK_SB_OFFSET(sb_dblocks, 8); @@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_SB_OFFSET(sb_rgextents, 276); XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 770adf60dd73..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1123,6 +1123,7 @@ xfs_rtfree_blocks( xfs_extlen_t mod; int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); mod = xfs_blen_to_rtxoff(mp, rtlen); @@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range( end = min(end, rtg->rtg_extents - 1); + if (xfs_has_zoned(mp)) + return -EINVAL; + /* Iterate the bitmap, looking 
for discrepancies. */ while (start <= end) { struct xfs_rtalloc_rec rec; @@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { + if (xfs_has_zoned(mp)) + return 0; return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } @@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount( xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + *rsumlevels = xfs_compute_rextslog(rextents) + 1; rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); return howmany_64(rsumwords, mp->m_blockwsize); diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index d84d32f1b48f..9186c58e83d5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -194,15 +194,17 @@ xfs_rtgroup_lock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - /* - * Lock both realtime free space metadata inodes for a freespace - * update. - */ - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. 
+ */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) @@ -228,11 +230,13 @@ xfs_rtgroup_unlock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } } @@ -249,7 +253,8 @@ xfs_rtgroup_trans_join( ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } @@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry( /* Fill out form. 
*/ memset(rgeo, 0, sizeof(*rgeo)); rgeo->rg_number = rtg_rgno(rtg); - rgeo->rg_length = rtg_group(rtg)->xg_block_count; + rgeo->rg_length = rtg_blocks(rtg); xfs_rtgroup_geom_health(rtg, rgeo); return 0; } @@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_BITMAP, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtbitmap_create, }, [XFS_RTGI_SUMMARY] = { @@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_SUMMARY, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtsummary_create, }, [XFS_RTGI_RMAP] = { diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 03f39d4e43fc..d36a6ae0abe5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -37,15 +37,33 @@ struct xfs_rtgroup { xfs_rtxnum_t rtg_extents; /* - * Cache of rt summary level per bitmap block with the invariant that - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, - * or 0 if rsum[i][bbno] == 0 for all i. - * + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. */ - uint8_t *rtg_rsum_cache; + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; }; +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. 
+ */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. + */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) return rtg->rtg_group.xg_gno; } +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) { return rtg->rtg_inodes[XFS_RTGI_BITMAP]; @@ -222,10 +245,14 @@ xfs_rtb_to_daddr( xfs_rtblock_t rtbno) { struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); } static inline xfs_rtblock_t @@ -233,10 +260,11 @@ xfs_daddr_to_rtb( struct xfs_mount *mp, xfs_daddr_t daddr) { - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; - if (xfs_has_rtgroups(mp)) { - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { xfs_rgnumber_t rgno; uint32_t rgbno; diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index e4ec36943cb7..9bdc2cbfc113 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb( xfs_btree_del_cursor(cur, error); return error; } + +/* + * 
Return the highest rgbno currently tracked by the rmap for this rtg. + */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index 9d0915089891..e328fd62a149 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, struct xfs_buftarg *btp, xfs_rgnumber_t rgno); +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3dc5f5dba162..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -185,6 +185,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -266,6 +268,9 @@ static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } @@ -275,9 +280,15 @@ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) - return false; + if (xfs_sb_is_v5(sbp) && + 
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || @@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups( return 0; } +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. 
*/ STATIC int xfs_validate_sb_common( @@ -523,6 +562,11 @@ xfs_validate_sb_common( if (error) return error; } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -835,6 +879,14 @@ __xfs_sb_from_disk( to->sb_rgcount = 1; to->sb_rgextents = 0; } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -1001,6 +1053,11 @@ xfs_sb_to_disk( to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize( rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; @@ -1265,8 +1326,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* @@ -1275,9 +1335,10 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. 
*/ - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = - percpu_counter_sum_positive(&mp->m_frextents); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); @@ -1510,6 +1571,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1530,6 +1593,10 @@ xfs_fs_geometry( geo->rgcount = sbp->sb_rgcount; geo->rgextents = sbp->sb_rgextents; } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ca2401c1facd..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -233,6 +233,34 @@ enum xfs_group_type { { XG_TYPE_AG, "ag" }, \ { XG_TYPE_RTG, "rtg" } +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. 
+ */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; 
+ } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. 
Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. 
+ */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. + */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 9f8c312dfd3c..303374df44bd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -69,6 +69,8 @@ STATIC size_t xchk_superblock_ondisk_size( struct xfs_mount *mp) { + if (xfs_has_zoned(mp)) + return offsetofend(struct xfs_dsb, sb_rtreserved); if (xfs_has_metadir(mp)) return offsetofend(struct xfs_dsb, sb_pad); if (xfs_has_metauuid(mp)) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 66da7d4d56ba..4f1e2574660d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1038,8 +1038,8 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink filesystems. */ - if (!xfs_has_reflink(mp)) { + /* No CoW forks filesystem doesn't support out of place writes */ + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); return 0; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ca23cf4db6c5..e629663e460a 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -350,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. 
*/ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -413,7 +413,13 @@ xchk_fscount_count_frextents( fsc->frextents = 0; fsc->frextents_delayed = 0; - if (!xfs_has_realtime(mp)) + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; while ((rtg = xfs_rtgroup_next(mp, rtg))) { @@ -513,8 +519,8 @@ xchk_fscounters( /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! 
*/ if (icount < 0 || ifree < 0) @@ -589,15 +595,17 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index cda13447a373..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,7 +64,7 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require @@ -74,10 +74,12 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ - percpu_counter_set(&mp->m_frextents, - fsc->frextents - fsc->frextents_delayed); - if (!xfs_has_rtgroups(mp)) - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index db6edd5a5fe5..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -273,6 +273,13 @@ xchk_inode_cowextsize( xfs_failaddr_t fa; uint32_t value = be32_to_cpu(dip->di_cowextsize); + /* + * The used block counter for rtrmap is checked and repaired elsewhere. 
+ */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 13ff1c933cb8..a90a011c7e5f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -710,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), @@ -1558,8 +1560,7 @@ xrep_dinode_core( /* Read the inode cluster buffer. */ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, - NULL); + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index ac38f5843090..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack( free = sc->sa.pag->pagf_freeblks; sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index c287c755f2c5..3537f3cca6d5 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -167,10 +167,11 @@ xrep_orphanage_create( * directory to control access to a file we put in here. 
*/ if (d_really_is_negative(orphanage_dentry)) { - error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, - 0750); - if (error) - goto out_dput_orphanage; + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, + orphanage_dentry, 0750); + error = PTR_ERR(orphanage_dentry); + if (IS_ERR(orphanage_dentry)) + goto out_unlock_root; } /* Not a directory? Bail out. */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b32fb233cf84..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) - return xrep_defer_finish(sc); + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } - return 0; + return xrep_reset_metafile_resv(sc); } /* diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 3b5288d3ef4e..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -43,6 +43,7 @@ #include "xfs_rtalloc.h" #include "xfs_metafile.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse( xfs_rtxnum_t startrtx; xfs_rtxnum_t endrtx; bool is_free = false; - int error; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } startrtx = xfs_rgbno_to_rtx(mp, rgbno); endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); @@ -1386,11 +1393,12 @@ int xrep_reset_metafile_resv( struct xfs_scrub *sc) { - struct xfs_inode *ip = sc->ip; + struct xfs_mount *mp = sc->mp; int64_t delta; int error; - delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; if (delta == 0) return 0; @@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv( if (delta > 0) { int64_t give_back; - give_back = 
min_t(uint64_t, delta, ip->i_delayed_blks); + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); if (give_back > 0) { - xfs_mod_delalloc(ip, 0, -give_back); - xfs_add_fdblocks(ip->i_mount, give_back); - ip->i_delayed_blks -= give_back; + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; } return 0; @@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv( /* * Not enough reservation; try to take some blocks from the filesystem - * to the metadata inode. @delta is negative here, so invert the sign. + * to the metabtree reservation. */ - delta = -delta; - error = xfs_dec_fdblocks(sc->mp, delta, true); + delta = -delta; /* delta is negative here, so invert the sign. */ + error = xfs_dec_fdblocks(mp, delta, true); while (error == -ENOSPC) { delta--; if (delta == 0) { xfs_warn(sc->mp, -"Insufficient free space to reset space reservation for inode 0x%llx after repair.", - ip->i_ino); +"Insufficient free space to reset metabtree reservation after repair."); return 0; } - error = xfs_dec_fdblocks(sc->mp, delta, true); + error = xfs_dec_fdblocks(mp, delta, true); } if (error) return error; - xfs_mod_delalloc(ip, 0, delta); - ip->i_delayed_blks += delta; + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; return 0; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e8c776a34c1d..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space( xfs_extlen_t len) { struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space( if (xchk_skip_xref(sc->sm)) return; + if 
(xfs_has_zoned(sc->mp)) { + if (!xfs_zone_rgbno_is_valid(rtg, + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); + return; + } + startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, @@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space( if (!xchk_should_check_xref(sc, &error, NULL)) return; if (is_free) - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); } diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c index 257cfb24beb4..983362447826 100644 --- a/fs/xfs/scrub/rtrefcount_repair.c +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -697,32 +697,6 @@ err_cur: return error; } -/* - * Now that we've logged the roots of the new btrees, invalidate all of the - * old blocks and free them. - */ -STATIC int -xrep_rtrefc_remove_old_tree( - struct xrep_rtrefc *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrefcountbt - * and aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, - &rr->old_rtrefcountbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrefcount inode so that we - * don't fail to expand the btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - /* Rebuild the rt refcount btree. */ int xrep_rtrefcountbt( @@ -769,8 +743,12 @@ xrep_rtrefcountbt( if (error) goto out_bitmap; - /* Kill the old tree. */ - error = xrep_rtrefc_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. 
+ */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); if (error) goto out_bitmap; diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index f2fdd7a9fc24..fc2592c53af5 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -810,28 +810,6 @@ err_cur: /* Reaping the old btree. */ -/* Reap the old rtrmapbt blocks. */ -STATIC int -xrep_rtrmap_remove_old_tree( - struct xrep_rtrmap *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrmapbt and - * aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrmap inode so that we don't - * fail to expand the new btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - static inline bool xrep_rtrmapbt_want_live_update( struct xchk_iscan *iscan, @@ -995,8 +973,11 @@ xrep_rtrmapbt( if (error) goto out_records; - /* Kill the old tree. */ - error = xrep_rtrmap_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. 
+ */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); if (error) goto out_records; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6fa9e3e5bab7..9908850bf76f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6d9965b546cb..26a04a783489 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -20,6 +20,8 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_icache.h" +#include "xfs_zone_alloc.h" +#include "xfs_rtgroup.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -77,6 +79,26 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +static void +xfs_ioend_put_open_zones( + struct iomap_ioend *ioend) +{ + struct iomap_ioend *tmp; + + /* + * Put the open zone for all ioends merged into this one (if any). + */ + list_for_each_entry(tmp, &ioend->io_list, io_list) + xfs_open_zone_put(tmp->io_private); + + /* + * The main ioend might not have an open zone if the submission failed + * before xfs_zone_alloc_and_submit got called. + */ + if (ioend->io_private) + xfs_open_zone_put(ioend->io_private); +} + /* * IO write completion. 
*/ @@ -86,6 +108,7 @@ xfs_end_ioend( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + bool is_zoned = xfs_is_zoned_inode(ip); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -115,10 +138,11 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { + ASSERT(!is_zoned); xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, - offset + size); + offset + size, NULL); } goto done; } @@ -126,14 +150,21 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (is_zoned) + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector, + ioend->io_private, NULLFSBLOCK); + else if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - if (!error && xfs_ioend_is_append(ioend)) + if (!error && + !(ioend->io_flags & IOMAP_IOEND_DIRECT) && + xfs_ioend_is_append(ioend)) error = xfs_setfilesize(ip, offset, size); done: + if (is_zoned) + xfs_ioend_put_open_zones(ioend); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } @@ -176,17 +207,27 @@ xfs_end_io( } } -STATIC void +void xfs_end_bio( struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; unsigned long flags; + /* + * For Appends record the actually written block number and set the + * boundary flag if needed. 
+ */ + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) { + ioend->io_sector = bio->bi_iter.bi_sector; + xfs_mark_rtg_boundary(ioend); + } + spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, &ip->i_ioend_work)); list_add_tail(&ioend->io_list, &ip->i_ioend_list); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); @@ -396,10 +437,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -410,7 +452,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,10 +460,14 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -458,12 +504,107 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. 
*/ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + folio_size(folio), NULL); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, + .discard_folio = xfs_discard_folio, +}; + +struct xfs_zoned_writepage_ctx { + struct iomap_writepage_ctx ctx; + struct xfs_open_zone *open_zone; +}; + +static inline struct xfs_zoned_writepage_ctx * +XFS_ZWPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx); +} + +static int +xfs_zoned_map_blocks( + struct iomap_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int len) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_filblks_t count_fsb; + struct xfs_bmbt_irec imap, del; + struct xfs_iext_cursor icur; + + if (xfs_is_shutdown(mp)) + return -EIO; + + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + + /* + * All dirty data must be covered by delalloc extents. But truncate can + * remove delalloc extents underneath us or reduce their size. + * Returning a hole tells iomap to not write back any data from this + * range, which is the right thing to do in that case. + * + * Otherwise just tell iomap to treat ranges previously covered by a + * delalloc extent as mapped. The actual block allocation will be done + * just before submitting the bio. + * + * This implies we never map outside folios that are locked or marked + * as under writeback, and thus there is no need check the fork sequence + * count here. 
+ */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + imap.br_startoff = end_fsb; /* fake a hole past EOF */ + if (imap.br_startoff > offset_fsb) { + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0); + return 0; + } + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + count_fsb = end_fsb - offset_fsb; + + del = imap; + xfs_trim_extent(&del, offset_fsb, count_fsb); + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del, + XFS_BMAPI_REMAP); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + wpc->iomap.type = IOMAP_MAPPED; + wpc->iomap.flags = IOMAP_F_DIRTY; + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev; + wpc->iomap.offset = offset; + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb); + wpc->iomap.flags = IOMAP_F_ANON_WRITE; + + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length); + return 0; +} + +static int +xfs_zoned_submit_ioend( + struct iomap_writepage_ctx *wpc, + int status) +{ + wpc->ioend->io_bio.bi_end_io = xfs_end_bio; + if (status) + return status; + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + return 0; +} + +static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { + .map_blocks = xfs_zoned_map_blocks, + .submit_ioend = xfs_zoned_submit_ioend, .discard_folio = xfs_discard_folio, }; @@ -472,10 +613,25 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); + if (xfs_is_zoned_inode(ip)) { + struct xfs_zoned_writepage_ctx xc = { }; + int error; + + error = 
iomap_writepages(mapping, wbc, &xc.ctx, + &xfs_zoned_writeback_ops); + if (xc.open_zone) + xfs_open_zone_put(xc.open_zone); + return error; + } else { + struct xfs_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc.ctx, + &xfs_writeback_ops); + } } STATIC int diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e0bd68419764..5a7a0f1a0b49 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,6 +9,7 @@ extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; -int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +void xfs_end_bio(struct bio *bio); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0836fea2d6d8..06ca11731e43 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -30,6 +30,7 @@ #include "xfs_reflink.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" /* Kernel only BMAP related definitions and functions */ @@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, - xfs_off_t end_byte) + xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + if (xfs_is_zoned_inode(ip) && ac) { + /* + * In a zoned buffered write context we need to return + * the punched delalloc allocations to the allocation + * context. This allows reusing them in the following + * iomap iterations. 
+ */ + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, XFS_BMAPI_REMAP); + ac->reserved_blocks += del.br_blockcount; + } else { + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, 0); + } + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -582,7 +598,7 @@ xfs_free_eofblocks( if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), - LLONG_MAX); + LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -825,7 +841,8 @@ int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; @@ -880,7 +897,7 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = xfs_zero_range(ip, offset, len, NULL); + error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; @@ -968,7 +985,8 @@ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -981,7 +999,7 @@ xfs_collapse_file_space( trace_xfs_collapse_file_space(ip); - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1a..c477b3361630 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -15,6 +15,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +struct xfs_zone_alloc_ctx; #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, - xfs_off_t start_byte, xfs_off_t end_byte); + xfs_off_t start_byte, xfs_off_t end_byte, + 
struct xfs_zone_alloc_ctx *ac); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5d560e9073f4..8e7f1b324b3b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -55,27 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } -static inline int -xfs_buf_is_vmapped( - struct xfs_buf *bp) -{ - /* - * Return true if the buffer is vmapped. - * - * b_addr is null if the buffer is not mapped, but the code is clever - * enough to know it doesn't have to map a single page, so the check has - * to be both for b_addr and bp->b_page_count > 1. 
- */ - return bp->b_addr && bp->b_page_count > 1; -} - -static inline int -xfs_buf_vmap_len( - struct xfs_buf *bp) -{ - return (bp->b_page_count * PAGE_SIZE); -} - /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -109,38 +88,168 @@ xfs_buf_stale( spin_unlock(&bp->b_lock); } +static void +xfs_buf_free_callback( + struct callback_head *cb) +{ + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); + + if (bp->b_maps != &bp->__b_map) + kfree(bp->b_maps); + kmem_cache_free(xfs_buf_cache, bp); +} + +static void +xfs_buf_free( + struct xfs_buf *bp) +{ + unsigned int size = BBTOB(bp->b_length); + + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) + mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); + + if (is_vmalloc_addr(bp->b_addr)) + vfree(bp->b_addr); + else if (bp->b_flags & _XBF_KMEM) + kfree(bp->b_addr); + else + folio_put(virt_to_folio(bp->b_addr)); + + call_rcu(&bp->b_rcu, xfs_buf_free_callback); +} + static int -xfs_buf_get_maps( +xfs_buf_alloc_kmem( struct xfs_buf *bp, - int map_count) + size_t size, + gfp_t gfp_mask) { - ASSERT(bp->b_maps == NULL); - bp->b_map_count = map_count; + ASSERT(is_power_of_2(size)); + ASSERT(size < PAGE_SIZE); - if (map_count == 1) { - bp->b_maps = &bp->__b_map; - return 0; - } + bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); + if (!bp->b_addr) + return -ENOMEM; - bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - if (!bp->b_maps) + /* + * Slab guarantees that we get back naturally aligned allocations for + * power of two sizes. Keep this check as the canary in the coal mine + * if anything changes in slab. 
+ */ + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { + kfree(bp->b_addr); + bp->b_addr = NULL; return -ENOMEM; + } + bp->b_flags |= _XBF_KMEM; + trace_xfs_buf_backing_kmem(bp, _RET_IP_); return 0; } -static void -xfs_buf_free_maps( - struct xfs_buf *bp) +/* + * Allocate backing memory for a buffer. + * + * For tmpfs-backed buffers used by in-memory btrees this directly maps the + * tmpfs page cache folios. + * + * For real file system buffers there are three different kinds backing memory: + * + * The first type backs the buffer by a kmalloc allocation. This is done for + * less than PAGE_SIZE allocations to avoid wasting memory. + * + * The second type is a single folio buffer - this may be a high order folio or + * just a single page sized folio, but either way they get treated the same way + * by the rest of the code - the buffer memory spans a single contiguous memory + * region that we don't have to map and unmap to access the data directly. + * + * The third type of buffer is the vmalloc()d buffer. This provides the buffer + * with the required contiguous memory region but backed by discontiguous + * physical pages. + */ +static int +xfs_buf_alloc_backing_mem( + struct xfs_buf *bp, + xfs_buf_flags_t flags) { - if (bp->b_maps != &bp->__b_map) { - kfree(bp->b_maps); - bp->b_maps = NULL; + size_t size = BBTOB(bp->b_length); + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; + struct folio *folio; + + if (xfs_buftarg_is_mem(bp->b_target)) + return xmbuf_map_backing_mem(bp); + + /* Assure zeroed buffer for non-read cases. */ + if (!(flags & XBF_READ)) + gfp_mask |= __GFP_ZERO; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + + /* + * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that + * is properly aligned. The slab allocator now guarantees an aligned + * allocation for all power of two sizes, which matches most of the + * smaller than PAGE_SIZE buffers used by XFS. 
+ */ + if (size < PAGE_SIZE && is_power_of_2(size)) + return xfs_buf_alloc_kmem(bp, size, gfp_mask); + + /* + * Don't bother with the retry loop for single PAGE allocations: vmalloc + * won't do any better. + */ + if (size <= PAGE_SIZE) + gfp_mask |= __GFP_NOFAIL; + + /* + * Optimistically attempt a single high order folio allocation for + * larger than PAGE_SIZE buffers. + * + * Allocating a high order folio makes the assumption that buffers are a + * power-of-2 size, matching the power-of-2 folios sizes available. + * + * The exception here are user xattr data buffers, which can be arbitrarily + * sized up to 64kB plus structure metadata, skip straight to the vmalloc + * path for them instead of wasting memory here. + */ + if (size > PAGE_SIZE) { + if (!is_power_of_2(size)) + goto fallback; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + gfp_mask |= __GFP_NORETRY; } + folio = folio_alloc(gfp_mask, get_order(size)); + if (!folio) { + if (size <= PAGE_SIZE) + return -ENOMEM; + trace_xfs_buf_backing_fallback(bp, _RET_IP_); + goto fallback; + } + bp->b_addr = folio_address(folio); + trace_xfs_buf_backing_folio(bp, _RET_IP_); + return 0; + +fallback: + for (;;) { + bp->b_addr = __vmalloc(size, gfp_mask); + if (bp->b_addr) + break; + if (flags & XBF_READ_AHEAD) + return -ENOMEM; + XFS_STATS_INC(bp->b_mount, xb_page_retries); + memalloc_retry_wait(gfp_mask); + } + + trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); + return 0; } static int -_xfs_buf_alloc( +xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -159,7 +268,7 @@ _xfs_buf_alloc( * We don't want certain flags to appear in b_flags unless they are * specifically set by later operations on the buffer. */ - flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); /* * A new buffer is held and locked by the owner. 
This ensures that the @@ -179,15 +288,14 @@ _xfs_buf_alloc( bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; - - error = xfs_buf_get_maps(bp, nmaps); - if (error) { - kmem_cache_free(xfs_buf_cache, bp); - return error; - } - bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; + bp->b_map_count = nmaps; + if (nmaps == 1) + bp->b_maps = &bp->__b_map; + else + bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; bp->b_maps[i].bm_len = map[i].bm_len; @@ -200,195 +308,13 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); - *bpp = bp; - return 0; -} - -static void -xfs_buf_free_pages( - struct xfs_buf *bp) -{ - uint i; - - ASSERT(bp->b_flags & _XBF_PAGES); - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - if (bp->b_pages[i]) - __free_page(bp->b_pages[i]); - } - mm_account_reclaimed_pages(bp->b_page_count); - - if (bp->b_pages != bp->b_page_array) - kfree(bp->b_pages); - bp->b_pages = NULL; - bp->b_flags &= ~_XBF_PAGES; -} - -static void -xfs_buf_free_callback( - struct callback_head *cb) -{ - struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); - - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); -} - -static void -xfs_buf_free( - struct xfs_buf *bp) -{ - trace_xfs_buf_free(bp, _RET_IP_); - - ASSERT(list_empty(&bp->b_lru)); - - if (xfs_buftarg_is_mem(bp->b_target)) - xmbuf_unmap_page(bp); - else if (bp->b_flags & _XBF_PAGES) - xfs_buf_free_pages(bp); - else if (bp->b_flags & _XBF_KMEM) - kfree(bp->b_addr); - - call_rcu(&bp->b_rcu, xfs_buf_free_callback); -} - -static int -xfs_buf_alloc_kmem( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; - size_t size = BBTOB(bp->b_length); - - /* Assure zeroed buffer for non-read cases. 
*/ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - bp->b_addr = kmalloc(size, gfp_mask); - if (!bp->b_addr) - return -ENOMEM; - - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ - kfree(bp->b_addr); - bp->b_addr = NULL; - return -ENOMEM; - } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; - bp->b_flags |= _XBF_KMEM; - return 0; -} - -static int -xfs_buf_alloc_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; - long filled = 0; - - if (flags & XBF_READ_AHEAD) - gfp_mask |= __GFP_NORETRY; - - /* Make sure that we have a page list */ - bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); - if (bp->b_page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, - gfp_mask); - if (!bp->b_pages) - return -ENOMEM; - } - bp->b_flags |= _XBF_PAGES; - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - /* - * Bulk filling of pages can take multiple calls. Not filling the entire - * array is not an allocation failure, so don't back off if we get at - * least one extra page. - */ - for (;;) { - long last = filled; - - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, - bp->b_pages); - if (filled == bp->b_page_count) { - XFS_STATS_INC(bp->b_mount, xb_page_found); - break; - } - - if (filled != last) - continue; - - if (flags & XBF_READ_AHEAD) { - xfs_buf_free_pages(bp); - return -ENOMEM; - } - - XFS_STATS_INC(bp->b_mount, xb_page_retries); - memalloc_retry_wait(gfp_mask); - } - return 0; -} - -/* - * Map buffer into kernel address-space if necessary. 
- */ -STATIC int -_xfs_buf_map_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - ASSERT(bp->b_flags & _XBF_PAGES); - if (bp->b_page_count == 1) { - /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]); - } else if (flags & XBF_UNMAPPED) { - bp->b_addr = NULL; - } else { - int retried = 0; - unsigned nofs_flag; - - /* - * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we often under a scoped nofs - * context here. Mixing GFP_KERNEL with GFP_NOFS allocations - * from the same call site that can be run from both above and - * below memory reclaim causes lockdep false positives. Hence we - * always need to force this allocation to nofs context because - * we can't pass __GFP_NOLOCKDEP down to auxillary structures to - * prevent false positive lockdep reports. - * - * XXX(dgc): I think dquot reclaim is the only place we can get - * to this function from memory reclaim context now. If we fix - * that like we've fixed inode reclaim to avoid writeback from - * reclaim, this nofs wrapping can go away. 
- */ - nofs_flag = memalloc_nofs_save(); - do { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1); - if (bp->b_addr) - break; - vm_unmap_aliases(); - } while (retried++ <= 1); - memalloc_nofs_restore(nofs_flag); - - if (!bp->b_addr) - return -ENOMEM; + error = xfs_buf_alloc_backing_mem(bp, flags); + if (error) { + xfs_buf_free(bp); + return error; } + *bpp = bp; return 0; } @@ -507,7 +433,7 @@ xfs_buf_find_lock( return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; @@ -575,25 +501,10 @@ xfs_buf_find_insert( struct xfs_buf *bp; int error; - error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); + error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; - if (xfs_buftarg_is_mem(new_bp->b_target)) { - error = xmbuf_map_page(new_bp); - } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - /* - * For buffers that fit entirely within a single page, first - * attempt to allocate the memory from the heap to minimise - * memory usage. If we can't get heap memory for these small - * buffers, we fall back to using the page allocator. - */ - error = xfs_buf_alloc_pages(new_bp, flags); - } - if (error) - goto out_free_buf; - /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; @@ -704,18 +615,6 @@ xfs_buf_get_map( xfs_perag_put(pag); } - /* We do not hold a perag reference anymore. */ - if (!bp->b_addr) { - error = _xfs_buf_map_pages(bp, flags); - if (unlikely(error)) { - xfs_warn_ratelimited(btp->bt_mount, - "%s: failed to map %u pages", __func__, - bp->b_page_count); - xfs_buf_relse(bp); - return error; - } - } - /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. 
@@ -903,7 +802,6 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -912,7 +810,7 @@ xfs_buf_read_uncached( *bpp = NULL; - error = xfs_buf_get_uncached(target, numblks, flags, &bp); + error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; @@ -938,42 +836,14 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; - struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - /* there are currently no valid flags for xfs_buf_get_uncached */ - ASSERT(flags == 0); - - *bpp = NULL; - - error = _xfs_buf_alloc(target, &map, 1, flags, &bp); - if (error) - return error; - - if (xfs_buftarg_is_mem(bp->b_target)) - error = xmbuf_map_page(bp); - else - error = xfs_buf_alloc_pages(bp, flags); - if (error) - goto fail_free_buf; - - error = _xfs_buf_map_pages(bp, 0); - if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages", __func__); - goto fail_free_buf; - } - - trace_xfs_buf_get_uncached(bp, _RET_IP_); - *bpp = bp; - return 0; - -fail_free_buf: - xfs_buf_free(bp); + error = xfs_buf_alloc(target, &map, 1, 0, bpp); + if (!error) + trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } @@ -1299,9 +1169,9 @@ __xfs_buf_ioend( trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { - if (!bp->b_error && xfs_buf_is_vmapped(bp)) + if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) invalidate_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); + roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) @@ -1462,29 +1332,48 @@ static void xfs_buf_submit_bio( struct xfs_buf *bp) { - unsigned int size = BBTOB(bp->b_length); - unsigned int map = 0, p; + unsigned int map = 0; struct blk_plug plug; struct bio *bio; - bio = bio_alloc(bp->b_target->bt_bdev, 
bp->b_page_count, - xfs_buf_bio_op(bp), GFP_NOIO); - bio->bi_private = bp; - bio->bi_end_io = xfs_buf_bio_end_io; + if (is_vmalloc_addr(bp->b_addr)) { + unsigned int size = BBTOB(bp->b_length); + unsigned int alloc_size = roundup(size, PAGE_SIZE); + void *data = bp->b_addr; - if (bp->b_flags & _XBF_KMEM) { - __bio_add_page(bio, virt_to_page(bp->b_addr), size, - bp->b_offset); - } else { - for (p = 0; p < bp->b_page_count; p++) - __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); - bio->bi_iter.bi_size = size; /* limit to the actual size used */ + bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT, + xfs_buf_bio_op(bp), GFP_NOIO); + + do { + unsigned int len = min(size, PAGE_SIZE); + + ASSERT(offset_in_page(data) == 0); + __bio_add_page(bio, vmalloc_to_page(data), len, 0); + data += len; + size -= len; + } while (size); - if (xfs_buf_is_vmapped(bp)) - flush_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); + flush_kernel_vmap_range(bp->b_addr, alloc_size); + } else { + /* + * Single folio or slab allocation. Must be contiguous and thus + * only a single bvec is needed. + * + * This uses the page based bio add helper for now as that is + * the lowest common denominator between folios and slab + * allocations. To be replaced with a better block layer + * helper soon (hopefully). + */ + bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp), + GFP_NOIO); + __bio_add_page(bio, virt_to_page(bp->b_addr), + BBTOB(bp->b_length), + offset_in_page(bp->b_addr)); } + bio->bi_private = bp; + bio->bi_end_io = xfs_buf_bio_end_io; + /* * If there is more than one map segment, split out a new bio for each * map except of the last one. 
The last map is handled by the @@ -1611,47 +1500,6 @@ xfs_buf_submit( xfs_buf_submit_bio(bp); } -void * -xfs_buf_offset( - struct xfs_buf *bp, - size_t offset) -{ - struct page *page; - - if (bp->b_addr) - return bp->b_addr + offset; - - page = bp->b_pages[offset >> PAGE_SHIFT]; - return page_address(page) + (offset & (PAGE_SIZE-1)); -} - -void -xfs_buf_zero( - struct xfs_buf *bp, - size_t boff, - size_t bsize) -{ - size_t bend; - - bend = boff + bsize; - while (boff < bend) { - struct page *page; - int page_index, page_offset, csize; - - page_index = (boff + bp->b_offset) >> PAGE_SHIFT; - page_offset = (boff + bp->b_offset) & ~PAGE_MASK; - page = bp->b_pages[page_index]; - csize = min_t(size_t, PAGE_SIZE - page_offset, - BBTOB(bp->b_length) - boff); - - ASSERT((csize + page_offset) <= PAGE_SIZE); - - memset(page_address(page) + page_offset, 0, csize); - - boff += csize; - } -} - /* * Log a message about and stale a buffer that a caller has decided is corrupt. * diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 80e06eecaf56..d0b065a9a9f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -36,7 +36,6 @@ struct xfs_buf; #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ @@ -48,7 +47,6 @@ struct xfs_buf; #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -62,14 +60,12 @@ typedef unsigned int xfs_buf_flags_t; { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ - { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following 
interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, \ - { XBF_UNMAPPED, "UNMAPPED" } + { XBF_TRYLOCK, "TRYLOCK" } /* * Internal state flags. @@ -124,8 +120,6 @@ struct xfs_buftarg { struct xfs_buf_cache bt_cache[]; }; -#define XB_PAGES 2 - struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ @@ -187,15 +181,10 @@ struct xfs_buf { struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; - struct page **b_pages; /* array of page pointers */ - struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset of b_addr, - only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ void (*b_iodone)(struct xfs_buf *bp); @@ -284,9 +273,9 @@ xfs_buf_readahead( } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp); + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); @@ -315,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); -void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) 
__xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ -extern void *xfs_buf_offset(struct xfs_buf *, size_t); +static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) +{ + return bp->b_addr + offset; +} + +static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) +{ + memset(bp->b_addr + boff, 0, bsize); +} + extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 47549cfa61cd..19eb0b7a3e58 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -57,24 +57,6 @@ xfs_buf_log_format_size( (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -static inline bool -xfs_buf_item_straddle( - struct xfs_buf *bp, - uint offset, - int first_bit, - int nbits) -{ - void *first, *last; - - first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); - last = xfs_buf_offset(bp, - offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); - - if (last - first != nbits * XFS_BLF_CHUNK) - return true; - return false; -} - /* * Return the number of log iovecs and space needed to log the given buf log * item segment. @@ -91,11 +73,8 @@ xfs_buf_item_size_segment( int *nvecs, int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; int first_bit; int nbits; - int next_bit; - int last_bit; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (first_bit == -1) @@ -108,15 +87,6 @@ xfs_buf_item_size_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. 
- */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - (*nvecs)++; *nbytes += nbits * XFS_BLF_CHUNK; @@ -131,40 +101,6 @@ xfs_buf_item_size_segment( } while (first_bit != -1); return; - -slow_scan: - /* Count the first bit we jumped out of the above loop from */ - (*nvecs)++; - *nbytes += XFS_BLF_CHUNK; - last_bit = first_bit; - while (last_bit != -1) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - last_bit + 1); - /* - * If we run out of bits, leave the loop, - * else if we find a new set of bits bump the number of vecs, - * else keep scanning the current set of bits. - */ - if (next_bit == -1) { - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - last_bit = next_bit; - first_bit = next_bit; - (*nvecs)++; - nbits = 1; - } else { - last_bit++; - nbits++; - } - *nbytes += XFS_BLF_CHUNK; - } } /* @@ -277,8 +213,6 @@ xfs_buf_item_format_segment( struct xfs_buf *bp = bip->bli_buf; uint base_size; int first_bit; - int last_bit; - int next_bit; uint nbits; /* copy the flags across from the base format item */ @@ -323,15 +257,6 @@ xfs_buf_item_format_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. 
- */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; @@ -347,45 +272,6 @@ xfs_buf_item_format_segment( } while (first_bit != -1); return; - -slow_scan: - ASSERT(bp->b_addr == NULL); - last_bit = first_bit; - nbits = 1; - for (;;) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - (uint)last_bit + 1); - /* - * If we run out of bits fill in the last iovec and get out of - * the loop. Else if we start a new set of bits then fill in - * the iovec for the series we were looking at and start - * counting the bits in the new one. Else we're still in the - * same set of bits so just keep counting and scanning. - */ - if (next_bit == -1) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else { - last_bit++; - nbits++; - } - } } /* diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 05a2f6927c12..d4c5cef5bc43 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2( struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; - uint buf_flags; xfs_lsn_t lsn; /* @@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2( } trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, 
buf_f->blf_len, - buf_flags, &bp, NULL); + 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 5b64a2b3b113..b4ffd80b7cb6 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -74,7 +74,7 @@ xmbuf_alloc( /* * We don't want to bother with kmapping data during repair, so don't - * allow highmem pages to back this mapping. + * allow highmem folios to back this mapping. */ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); @@ -127,14 +127,13 @@ xmbuf_free( kfree(btp); } -/* Directly map a shmem page into the buffer cache. */ +/* Directly map a shmem folio into the buffer cache. */ int -xmbuf_map_page( +xmbuf_map_backing_mem( struct xfs_buf *bp) { struct inode *inode = file_inode(bp->b_target->bt_file); struct folio *folio = NULL; - struct page *page; loff_t pos = BBTOB(xfs_buf_daddr(bp)); int error; @@ -159,39 +158,17 @@ xmbuf_map_page( return -EIO; } - page = folio_file_page(folio, pos >> PAGE_SHIFT); - /* - * Mark the page dirty so that it won't be reclaimed once we drop the - * (potentially last) reference in xmbuf_unmap_page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfs_buf_free. */ - set_page_dirty(page); - unlock_page(page); + folio_set_dirty(folio); + folio_unlock(folio); - bp->b_addr = page_address(page); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = page; - bp->b_page_count = 1; + bp->b_addr = folio_address(folio); return 0; } -/* Unmap a shmem page that was mapped into the buffer cache. */ -void -xmbuf_unmap_page( - struct xfs_buf *bp) -{ - struct page *page = bp->b_pages[0]; - - ASSERT(xfs_buftarg_is_mem(bp->b_target)); - - put_page(page); - - bp->b_addr = NULL; - bp->b_pages[0] = NULL; - bp->b_pages = NULL; - bp->b_page_count = 0; -} - /* Is this a valid daddr within the buftarg? 
*/ bool xmbuf_verify_daddr( @@ -205,7 +182,7 @@ xmbuf_verify_daddr( return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); } -/* Discard the page backing this buffer. */ +/* Discard the folio backing this buffer. */ static void xmbuf_stale( struct xfs_buf *bp) @@ -220,7 +197,7 @@ xmbuf_stale( } /* - * Finalize a buffer -- discard the backing page if it's stale, or run the + * Finalize a buffer -- discard the backing folio if it's stale, or run the * write verifier to detect problems. */ int diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index eed4a7b63232..67d525cc1513 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr, struct xfs_buftarg **btpp); void xmbuf_free(struct xfs_buftarg *btp); -int xmbuf_map_page(struct xfs_buf *bp); -void xmbuf_unmap_page(struct xfs_buf *bp); bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); int xmbuf_finalize(struct xfs_buf *bp); #else # define xfs_buftarg_is_mem(...) (false) -# define xmbuf_map_page(...) (-ENOMEM) -# define xmbuf_unmap_page(...) ((void)0) # define xmbuf_verify_daddr(...) 
(false) #endif /* CONFIG_XFS_MEMORY_BUFS */ +int xmbuf_map_backing_mem(struct xfs_buf *bp); + #endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 3f2403a7b49c..c1a306268ae4 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -844,7 +844,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp && + + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) rt_bdev = mp->m_rtdev_targp->bt_bdev; if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ea43c9a6e54c..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all( while ((pag = xfs_perag_next(mp, pag))) xfs_extent_busy_wait_group(pag_group(pag)); - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a25c713ff888..777438b853da 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item( trace_xfs_extent_free_deferred(mp, xefi); - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { - if (*rtgp != to_rtg(xefi->xefi_group)) { - *rtgp = to_rtg(xefi->xefi_group); - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); - xfs_rtgroup_trans_join(tp, *rtgp, - XFS_RTGLOCK_BITMAP); - } - error = xfs_rtfree_blocks(tp, *rtgp, - xefi->xefi_startblock, xefi->xefi_blockcount); + if (xefi->xefi_flags & XFS_EFI_CANCELLED) + goto done; + + if (*rtgp != to_rtg(xefi->xefi_group)) { + unsigned int lock_flags; + + if (xfs_has_zoned(mp)) + 
lock_flags = XFS_RTGLOCK_RMAP; + else + lock_flags = XFS_RTGLOCK_BITMAP; + + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, lock_flags); + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); } + + if (xfs_has_zoned(mp)) { + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } else { + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } + if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; } - +done: xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(item); return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a435b1ff264..84f08c976ac4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -150,7 +152,7 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. 
*/ - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); @@ -360,7 +362,8 @@ xfs_file_write_zero_eof( struct iov_iter *from, unsigned int *iolock, size_t count, - bool *drained_dio) + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); loff_t isize; @@ -414,7 +417,7 @@ xfs_file_write_zero_eof( trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; @@ -431,7 +434,8 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - unsigned int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); @@ -481,7 +485,7 @@ restart: */ if (iocb->ki_pos > i_size_read(inode)) { error = xfs_file_write_zero_eof(iocb, from, iolock, count, - &drained_dio); + &drained_dio, ac); if (error == 1) goto restart; if (error) @@ -491,6 +495,48 @@ restart: return kiocb_modified(iocb); } +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + + /* + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. 
If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. + */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. + */ + return xfs_zoned_space_reserve(ip, + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -503,6 +549,9 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + trace_xfs_end_io_direct_write(ip, offset, size); if (xfs_is_shutdown(ip->i_mount)) @@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { .end_io = xfs_dio_write_end_io, }; +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; + } + ac->reserved_blocks -= count_fsb; + + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + 
xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} + +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Handle block aligned direct I/O writes + * Handle block aligned direct I/O writes. */ static noinline ssize_t xfs_file_dio_write_aligned( struct xfs_inode *ip, struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; @@ -597,7 +683,7 @@ xfs_file_dio_write_aligned( ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, ac); if (ret) goto out_unlock; @@ -611,11 +697,31 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, NULL, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); out_unlock: - if (iolock) - xfs_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. 
+ */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip, &ac); return ret; } @@ -675,7 +781,7 @@ retry_exclusive: goto out_unlock; } - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out_unlock; @@ -721,9 +827,21 @@ xfs_file_dio_write( /* direct I/O must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + + /* + * For always COW inodes we also must check the alignment of each + * individual iovec segment, as they could end up with different + * I/Os due to the way bio_iov_iter_get_pages works, and we'd + * then overwrite an already written block. 
+ */ + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || + (xfs_is_always_cow_inode(ip) && + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) return xfs_file_dio_write_unaligned(ip, iocb, from); - return xfs_file_dio_write_aligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } static noinline ssize_t @@ -740,7 +858,7 @@ xfs_file_dax_write( ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -784,7 +902,7 @@ write_retry: if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -832,6 +950,67 @@ out: } STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. 
+ */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. + */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) @@ -878,6 +1057,8 @@ xfs_file_write_iter( return ret; } + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); return xfs_file_buffered_write(iocb, from); } @@ -932,7 +1113,8 @@ static int xfs_falloc_collapse_range( struct file *file, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); loff_t new_size = i_size_read(inode) - len; @@ -948,7 +1130,7 @@ xfs_falloc_collapse_range( if (offset + len >= i_size_read(inode)) return -EINVAL; - error = xfs_collapse_file_space(XFS_I(inode), offset, len); + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -1004,7 +1186,8 @@ xfs_falloc_zero_range( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); unsigned int blksize = i_blocksize(inode); @@ -1017,7 +1200,7 @@ xfs_falloc_zero_range( if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len); + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); if (error) 
return error; @@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range( FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long -xfs_file_fallocate( +__xfs_file_fallocate( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) @@ -1124,16 +1303,16 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); break; case FALLOC_FL_COLLAPSE_RANGE: - error = xfs_falloc_collapse_range(file, offset, len); + error = xfs_falloc_collapse_range(file, offset, len, ac); break; case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: - error = xfs_falloc_zero_range(file, mode, offset, len); + error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; case FALLOC_FL_UNSHARE_RANGE: error = xfs_falloc_unshare_range(file, mode, offset, len); @@ -1154,6 +1333,54 @@ out_unlock: return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if 
(mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get a reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. + */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int xfs_file_fadvise( struct file *file, @@ -1347,15 +1574,22 @@ xfs_file_release( * blocks. This avoids open/read/close workloads from removing EOF * blocks that other writers depend upon to reduce fragmentation. * + * Inodes on the zoned RT device never have preallocations, so skip + * taking the locks below. + */ + if (!inode->i_nlink || + !(file->f_mode & FMODE_WRITE) || + (ip->i_diflags & XFS_DIFLAG_APPEND) || + xfs_is_zoned_inode(ip)) + return 0; + + /* * If we can't get the iolock just skip truncating the blocks past EOF * because we could deadlock with the mmap_lock otherwise. We'll get * another chance to drop them once the last reference to the inode is * dropped, so we'll never leak blocks permanently. 
*/ - if (inode->i_nlink && - (file->f_mode & FMODE_WRITE) && - !(ip->i_diflags & XFS_DIFLAG_APPEND) && - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (xfs_can_free_eofblocks(ip) && !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) @@ -1469,9 +1703,10 @@ xfs_dax_read_fault( * i_lock (XFS - extent map serialisation) */ static vm_fault_t -xfs_write_fault( +__xfs_write_fault( struct vm_fault *vmf, - unsigned int order) + unsigned int order, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1498,13 +1733,50 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + ac); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } +static vm_fault_t +xfs_write_fault_zoned( + struct vm_fault *vmf, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int len = folio_size(page_folio(vmf->page)); + struct xfs_zone_alloc_ctx ac = { }; + int error; + vm_fault_t ret; + + /* + * This could over-allocate as it doesn't check for truncation. + * + * But as the overallocation is limited to less than a folio and will be + * release instantly that's just fine. 
+ */ + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, + &ac); + if (error < 0) + return vmf_fs_error(error); + ret = __xfs_write_fault(vmf, order, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) + return xfs_write_fault_zoned(vmf, order); + return __xfs_write_fault(vmf, order, NULL); +} + static inline bool xfs_is_write_fault( struct vm_fault *vmf) @@ -1613,7 +1885,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 1dbd2d75f7ae..a4bc1642fe56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt( struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; xfs_rtblock_t start_rtb; xfs_rtblock_t end_rtb; xfs_rgnumber_t start_rg, end_rg; uint64_t eofs; int error = 0; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (keys[0].fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* Adjust the low key if we are continuing from where we left off. 
*/ + if (keys[0].fmr_length > 0) { + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; + return 0; + } + + /* Fabricate an rmap entry for space occupied by the data dev */ + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + } + + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + + min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; @@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt( } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? */ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -1126,7 +1166,7 @@ xfs_getfsmap( /* Set up our device handlers. 
*/ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -1134,13 +1174,17 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the callers is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); if (use_rmap) handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; else @@ -1230,7 +1274,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 455298503d01..0ada73569394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,6 +24,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_metafile.h" /* * Write new AG headers to disk. 
Non-transactional, but need to be @@ -110,7 +111,7 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) return error; xfs_buf_relse(bp); @@ -300,24 +301,30 @@ xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { - int error = 0; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; + /* we can't grow the data section when an internal RT section exists */ + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { + error = -EINVAL; + goto out_unlock; + } + /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) - goto out_error; + goto out_unlock; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) - goto out_error; + goto out_unlock; } /* Post growfs calculations needed to reflect new state in operations */ @@ -331,13 +338,12 @@ xfs_growfs_data( /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); -out_error: /* - * Increment the generation unconditionally, the error could be from - * updating the secondary superblocks, in which case the new size - * is live already. + * Increment the generation unconditionally, after trying to update the + * secondary superblocks, as the new size is live already at this point. 
*/ mp->m_generation++; +out_unlock: mutex_unlock(&mp->m_growlock); return error; } @@ -366,6 +372,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -373,6 +380,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -391,16 +400,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -409,7 +418,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -419,10 +428,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. 
*/ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -436,9 +445,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks( return error; } - if (xfs_has_realtime(mp)) { - err2 = xfs_rt_resv_init(mp); - if (err2 && err2 != -ENOSPC) { - xfs_warn(mp, - "Error %d reserving realtime metadata reserve pool.", err2); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + err2 = xfs_metafile_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (err2 && !error) + if (!error) error = err2; } @@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks( { struct xfs_perag *pag = NULL; - if (xfs_has_realtime(mp)) - xfs_rt_resv_free(mp); - + xfs_metafile_resv_free(mp); while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int 
xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7b6c026d01a1..2f53ca7e12d4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_REALTIME_INODE(ip)) + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b1f9f156ec88..ce6b8ffbaa2c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1721,8 +1721,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED, &bp); + mp->m_bsize * igeo->blocks_per_cluster, 0, &bp); if (error) return error; @@ -3074,5 +3073,6 @@ bool xfs_is_always_cow_inode( const struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); + return xfs_is_zoned_inode(ip) || + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352..4bb7a99e0dc4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -25,19 +25,9 @@ struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. 
*/ struct xfs_mount *i_mount; /* fs mount struct ptr */ - union { - struct { - struct xfs_dquot *i_udquot; /* user dquot */ - struct xfs_dquot *i_gdquot; /* group dquot */ - struct xfs_dquot *i_pdquot; /* project dquot */ - }; - - /* - * Space that has been set aside to accomodate expansions of a - * metadata btree rooted in this file. - */ - uint64_t i_meta_resv_asked; - }; + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ @@ -69,8 +59,13 @@ typedef struct xfs_inode { xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + /* + * i_used_blocks is used for zoned rtrmap inodes, + * i_cowextsize is used for other v3 inodes, + * i_flushiter for v1/2 inodes + */ union { + uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; @@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) +{ + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); +} + bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35803fcf0beb..40fc1bf900af 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -596,6 +596,7 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = 
ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index f3bfb814378c..7205fd14f6b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -203,6 +203,7 @@ xfs_log_dinode_to_disk( to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ed85322507dd..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d61460309a78..cb23c8871f81 100644 --- a/fs/xfs/xfs_iomap.c +++ 
b/fs/xfs/xfs_iomap.c @@ -30,6 +30,8 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -431,13 +433,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -536,10 +539,10 @@ xfs_iomap_prealloc_size( if (unlikely(XFS_IS_REALTIME_INODE(ip))) freesp = xfs_rtbxlen_to_blen(mp, - xfs_iomap_freesp(&mp->m_frextents, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* @@ -828,6 +831,10 @@ xfs_direct_write_iomap_begin( if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; + /* HW-offload atomics are always used in this path */ + if (flags & IOMAP_ATOMIC) + iomap_flags |= IOMAP_F_ATOMIC_BIO; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. @@ -962,6 +969,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +#ifdef CONFIG_XFS_RT +/* + * This is really simple. The space has already been reserved before taking the + * IOLOCK, the actual block allocation is done just before submitting the bio + * and only recorded in the extent map on I/O completion. 
+ */ +static int +xfs_zoned_direct_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); + + /* + * Needs to be pushed down into the allocator so that only writes into + * a single zone can be supported. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + /* + * Ensure the extent list is in memory in so that we don't have to do + * read it from the I/O completion handler. + */ + if (xfs_need_iread_extents(&ip->i_df)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + } + + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DIRTY; + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; + iomap->offset = offset; + iomap->length = length; + iomap->flags = IOMAP_F_ANON_WRITE; + return 0; +} + +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { + .iomap_begin = xfs_zoned_direct_write_iomap_begin, +}; +#endif /* CONFIG_XFS_RT */ + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -987,6 +1047,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_end = xfs_dax_write_iomap_end, }; +/* + * Convert a hole to a delayed allocation. 
+ */ +static void +xfs_bmap_add_extent_hole_delay( + struct xfs_inode *ip, /* incore inode pointer */ + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *new) /* new data to add to file extents */ +{ + struct xfs_ifork *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + uint32_t state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ + + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { + state |= BMAP_LEFT_VALID; + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (xfs_iext_get_extent(ifp, icur, &right)) { + state |= BMAP_RIGHT_VALID; + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. 
+ */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; + + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. 
+ */ + oldlen = newlen = 0; + xfs_iext_insert(ip, icur, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + + /* + * Nothing to do for disk quota accounting here. + */ + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); + } +} + +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ +static int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *icur, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_extlen_t alen; + xfs_extlen_t indlen; + uint64_t fdblocks; + int error; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; + +retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. 
+ */ + aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) + goto out; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. 
+ */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + ASSERT(!xfs_is_zoned_inode(ip)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } + + error = xfs_dec_fdblocks(mp, fdblocks, false); + if (error) + goto out_unreserve_frextents; + + ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip, alen, indlen); + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); + + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + + return 0; + +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } + return error; +} + +static int +xfs_zoned_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + struct xfs_zone_alloc_ctx *ac = iter->private; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = 
XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + u16 iomap_flags = IOMAP_F_SHARED; + unsigned int lockmode = XFS_ILOCK_EXCL; + xfs_filblks_t count_fsb; + xfs_extlen_t indlen; + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + int error = 0; + + ASSERT(!xfs_get_extsz_hint(ip)); + ASSERT(!(flags & IOMAP_UNSHARE)); + ASSERT(ac); + + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip); + if (error) + return error; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* + * For zeroing operations check if there is any data to zero first. + * + * For regular writes we always need to allocate new blocks, but need to + * provide the source mapping when the range is unaligned to support + * read-modify-write of the whole block in the page cache. + * + * In either case we need to limit the reported range to the boundaries + * of the source map in the data fork. + */ + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || + (flags & IOMAP_ZERO)) { + struct xfs_bmbt_irec smap; + struct xfs_iext_cursor scur; + + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, + &smap)) + smap.br_startoff = end_fsb; /* fake hole until EOF */ + if (smap.br_startoff > offset_fsb) { + /* + * We never need to allocate blocks for zeroing a hole. 
+ */ + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, + smap.br_startoff); + goto out_unlock; + } + end_fsb = min(end_fsb, smap.br_startoff); + } else { + end_fsb = min(end_fsb, + smap.br_startoff + smap.br_blockcount); + xfs_trim_extent(&smap, offset_fsb, + end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); + if (error) + goto out_unlock; + } + } + + if (!ip->i_cowfp) + xfs_ifork_init_cow(ip); + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = end_fsb; + if (got.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* + * Cap the maximum length to keep the chunks of work done here somewhat + * symmetric with the work writeback does. + */ + end_fsb = min(end_fsb, got.br_startoff); + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + + /* + * The block reservation is supposed to cover all blocks that the + * operation could possible write, but there is a nasty corner case + * where blocks could be stolen from underneath us: + * + * 1) while this thread iterates over a larger buffered write, + * 2) another thread is causing a write fault that calls into + * ->page_mkwrite in range this thread writes to, using up the + * delalloc reservation created by a previous call to this function. + * 3) another thread does direct I/O on the range that the write fault + * happened on, which causes writeback of the dirty data. + * 4) this then set the stale flag, which cuts the current iomap + * iteration short, causing the new call to ->iomap_begin that gets + * us here again, but now without a sufficient reservation. + * + * This is a very unusual I/O pattern, and nothing but generic/095 is + * known to hit it. There's not really much we can do here, so turn this + * into a short write. 
+ */ + if (count_fsb > ac->reserved_blocks) { + xfs_warn_ratelimited(mp, +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", + ip->i_ino, current->comm); + count_fsb = ac->reserved_blocks; + if (!count_fsb) { + error = -EIO; + goto out_unlock; + } + } + + error = xfs_quota_reserve_blkres(ip, count_fsb); + if (error) + goto out_unlock; + + indlen = xfs_bmap_worst_indlen(ip, count_fsb); + error = xfs_dec_fdblocks(mp, indlen, false); + if (error) + goto out_unlock; + ip->i_delayed_blks += count_fsb; + xfs_mod_delalloc(ip, count_fsb, indlen); + + got.br_startoff = offset_fsb; + got.br_startblock = nullstartblock(indlen); + got.br_blockcount = count_fsb; + got.br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); + ac->reserved_blocks -= count_fsb; + iomap_flags |= IOMAP_F_NEW; + + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), + XFS_COW_FORK, &got); +done: + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + static int xfs_buffered_write_iomap_begin( struct inode *inode, @@ -1013,6 +1522,10 @@ xfs_buffered_write_iomap_begin( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_is_zoned_inode(ip)) + return xfs_zoned_buffered_write_iomap_begin(inode, offset, + count, flags, iomap, srcmap); + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, @@ -1245,10 +1758,13 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + xfs_bmap_punch_delalloc_range(XFS_I(inode), (iomap->flags & IOMAP_F_SHARED) ? 
XFS_COW_FORK : XFS_DATA_FORK, - offset, offset + length); + offset, offset + length, iter->private); } static int @@ -1485,6 +2001,7 @@ xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1495,13 +2012,14 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1510,5 +2028,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8347268af727..d330c4a581b1 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -10,6 +10,7 @@ struct xfs_inode; struct xfs_bmbt_irec; +struct xfs_zone_alloc_ctx; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, - bool *did_zero); -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); + struct xfs_zone_alloc_ctx *ac, bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -49,6 +51,7 @@ xfs_aligned_fsb_count( extern const struct iomap_ops xfs_buffered_write_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; extern const struct iomap_ops 
xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 40289fe6f5b2..756bd3ca8e00 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -298,14 +299,14 @@ xfs_vn_create( return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } -STATIC int +STATIC struct dentry * xfs_vn_mkdir( struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * @@ -854,6 +855,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -890,6 +892,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. + */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. 
This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -902,11 +926,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f8851ff835de..6493bdb57351 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -3540,6 +3541,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 6ed485ff2756..15d410d16bb2 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -173,6 +173,10 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", }, + [XFS_EXPERIMENTAL_ZONED] = { + .opstate = XFS_OPSTATE_WARNED_ZONED, + .name = "zoned RT device", + }, }; ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb36ced9df7..a92a4d09c8e9 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -99,6 +99,7 @@ enum xfs_experimental_feat { XFS_EXPERIMENTAL_EXCHRANGE, XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, + XFS_EXPERIMENTAL_ZONED, XFS_EXPERIMENTAL_MAX, }; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b69356582b86..00b53f479ece 100644 --- a/fs/xfs/xfs_mount.c +++ 
b/fs/xfs/xfs_mount.c @@ -40,6 +40,7 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -185,7 +186,7 @@ xfs_readsb( */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -413,7 +414,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; @@ -430,7 +431,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSB_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; @@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { - uint64_t resblks; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. - */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. 
+ * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); + return 0; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. */ @@ -543,7 +560,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; @@ -678,6 +695,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -747,27 +765,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); - if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); + error = xfs_mount_sysfs_init(mp); if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -1031,6 +1037,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota 
initialisation, post-log-replay component. */ @@ -1046,22 +1058,28 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. + * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. 
*/ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } return 0; @@ -1069,6 +1087,8 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1116,13 +1136,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1148,8 +1165,12 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) @@ -1173,7 +1194,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1195,10 +1216,8 @@ xfs_unmountfs( xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* @@ -1220,52 +1239,67 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. 
This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. */ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. 
*/ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { - int64_t lcounter; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); - if (rsvd) - ASSERT(has_resv_pool); + ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we @@ -1275,7 +1309,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1292,34 +1326,34 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. 
+ */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&counter->count, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! */ + return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbed172d6770..799b84220ebb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -98,11 +98,41 @@ struct xfs_groups { uint8_t blklog; /* + * Zoned devices can have gaps beyond the usable capacity of a zone and + * the end in the LBA/daddr address space. In other words, the hardware + * equivalent to the RT groups already takes care of the power of 2 + * alignment for us. In this case the sparse FSB/RTB address space maps + * 1:1 to the device address space. + */ + bool has_daddr_gaps; + + /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; + + /* + * Start of the first group in the device. This is used to support a + * RT device following the data device on the same block device for + * SMR hard drives. 
+ */ + xfs_fsblock_t start_fsb; +}; + +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; }; /* @@ -198,6 +228,7 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -222,8 +253,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -245,10 +276,8 @@ typedef struct xfs_mount { atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; @@ -258,10 +287,16 @@ typedef struct xfs_mount { #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif + struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ + struct mutex m_metafile_resv_lock; + uint64_t m_metafile_resv_target; + uint64_t 
m_metafile_resv_used; + uint64_t m_metafile_resv_avail; + /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* @@ -336,8 +371,10 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) +__XFS_HAS_FEAT(zoned, ZONED) +__XFS_HAS_FEAT(nolifetime, NOLIFETIME) static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) { @@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) static inline bool xfs_has_rtsb(const struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ - return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); + return xfs_has_rtgroups(mp) && + xfs_has_realtime(mp) && + !xfs_has_zoned(mp); } static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) @@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) xfs_has_reflink(mp); } +static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) +{ + return !xfs_has_zoned(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. 
@@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ #define XFS_OPSTATE_RESUMING_QUOTAON 18 +/* Kernel has logged a warning about zoned RT device being used on this fs. */ +#define XFS_OPSTATE_WARNED_ZONED 19 +/* (Zoned) GC is in progress */ +#define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) +__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } extern void xfs_uuid_table_free(void); -extern uint64_t xfs_default_resblks(xfs_mount_t *mp); +uint64_t xfs_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); @@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, + enum xfs_free_counter ctr); + /* - * Estimate the amount of free space that is not available to userspace and is - * not explicitly reserved from the incore fdblocks. This includes: - * - * - The minimum number of blocks needed to support splitting a bmap btree - * - The blocks currently in use by the freespace btrees because they record - * the actual blocks that will fill per-AG metadata space reservations + * Sum up the freecount, but never return negative values. 
*/ -static inline uint64_t -xfs_fdblocks_unavailable( - struct xfs_mount *mp) +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) { - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + return percpu_counter_sum_positive(&mp->m_free[ctr].count); } -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +/* + * Same as above, but does return negative values. Mostly useful for + * special cases like repair and tracing. + */ +static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum(&mp->m_free[ctr].count); +} + +/* + * This just provides and estimate without the cpu-local updates, use + * xfs_sum_freecounter for the exact value. + */ +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) +{ + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); +} + +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) +{ + percpu_counter_set(&mp->m_free[ctr].count, val); +} + +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static 
inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) +{ + percpu_counter_add(&mp->m_delalloc_blks, delta); +} #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e1ba5af6250f..417439b58785 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas( * immediately. We only support rtquota if rtgroups are enabled to * avoid problems with older kernels. 
*/ - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { + if (mp->m_sb.sb_rextents && + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb80..cc3b4df88110 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks( if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, - &del); + &del, 0); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_rtgroup *rtg; - xfs_rgnumber_t rgno; - - rgno = xfs_rtb_to_rgno(mp, fsb); - rtg = xfs_rtgroup_get(mp, rgno); - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) - error = -ENOSPC; - xfs_rtgroup_put(rtg); - return error; + if (xfs_metafile_resv_critical(mp)) + return -ENOSPC; + return 0; } agno = XFS_FSB_TO_AGNO(mp, fsb); @@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return xfs_zero_range(ip, isize, pos - isize, NULL); + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); } /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57bef567e011..6484c596ecea 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@ #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" #include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range 
given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg( for (i = 0; i < XFS_RTGI_MAX; i++) xfs_rtginode_irele(&rtg->rtg_inodes[i]); - kvfree(rtg->rtg_rsum_cache); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); } static int @@ -837,7 +839,7 @@ xfs_growfs_rt_init_rtsb( return 0; error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), - 0, &rtsb_bp); + &rtsb_bp); if (error) return error; @@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb( return error; } +static void +xfs_growfs_rt_sb_fields( + struct xfs_trans *tp, + const struct xfs_mount *nmp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_mount *nmp; + struct xfs_trans *tp; + xfs_rtbxlen_t freed_rtx; + int error; + + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. 
+ */ + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, + mp->m_sb.sb_rextsize); + if (!nmp) + return -ENOMEM; + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); + if (error) + goto out_free; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + + xfs_growfs_rt_sb_fields(tp, nmp); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. + */ + mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); + xfs_zoned_add_available(mp, freed_rtx); +out_free: + kfree(nmp); + return error; +} + static int xfs_growfs_rt_bmblock( struct xfs_rtgroup *rtg, @@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock( /* * Update superblock fields. 
*/ - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); + xfs_growfs_rt_sb_fields(args.tp, nmp); /* * Free the new extent. @@ -1127,6 +1190,11 @@ xfs_growfs_rtg( goto out_rele; } + if (xfs_has_zoned(mp)) { + error = xfs_growfs_rt_zoned(rtg, nrblocks); + goto out_rele; + } + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); if (error) goto out_rele; @@ -1144,10 +1212,8 @@ xfs_growfs_rtg( goto out_error; } - if (old_rsum_cache) - kvfree(old_rsum_cache); - xfs_rtgroup_rele(rtg); - return 0; + kvfree(old_rsum_cache); + goto out_rele; out_error: /* @@ -1195,6 +1261,22 @@ xfs_growfs_check_rtgeom( if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; + + if (xfs_has_zoned(mp)) { + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; + uint32_t rem; + + if (rextsize != 1) + return -EINVAL; + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + if (rem) { + xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", + mp->m_sb.sb_rblocks, gblocks); + return -EINVAL; + } + } + return 0; } @@ -1249,6 +1331,35 @@ xfs_grow_last_rtg( } /* + * Read in the last block of the RT device to make sure 
it is accessible. + */ +static int +xfs_rt_check_size( + struct xfs_mount *mp, + xfs_rfsblock_t last_block) +{ + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); + struct xfs_buf *bp; + int error; + + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { + xfs_warn(mp, "RT device size overflow: %llu != %llu", + XFS_BB_TO_FSB(mp, daddr), last_block); + return -EFBIG; + } + + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, + XFS_FSB_TO_BB(mp, 1), &bp, NULL); + if (error) + xfs_warn(mp, "cannot read last RT device sector (%lld)", + last_block); + else + xfs_buf_relse(bp); + return error; +} + +/* * Grow the realtime area of the filesystem. */ int @@ -1259,7 +1370,6 @@ xfs_growfs_rt( xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; xfs_rgnumber_t new_rgcount = 1; xfs_rgnumber_t rgno; - struct xfs_buf *bp; xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; int error; @@ -1302,15 +1412,10 @@ xfs_growfs_rt( error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) goto out_unlock; - /* - * Read in the last block of the device, make sure it exists. - */ - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + error = xfs_rt_check_size(mp, in->newblocks - 1); if (error) goto out_unlock; - xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. @@ -1376,8 +1481,7 @@ xfs_growfs_rt( error = error2; /* Reset the rt metadata btree space reservations. 
*/ - xfs_rt_resv_free(mp); - error2 = xfs_rt_resv_init(mp); + error2 = xfs_metafile_resv_init(mp); if (error2 && error2 != -ENOSPC) error = error2; } @@ -1407,7 +1511,7 @@ xfs_rtmount_readsb( /* m_blkbb_log is not set up yet */ error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, - mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp, + mp->m_sb.sb_blocksize >> BBSHIFT, &bp, &xfs_rtsb_buf_ops); if (error) { xfs_warn(mp, "rt sb validate failed with error %d.", error); @@ -1444,10 +1548,6 @@ int /* error */ xfs_rtmount_init( struct xfs_mount *mp) /* file system mount structure */ { - struct xfs_buf *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - int error; - if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { @@ -1458,25 +1558,7 @@ xfs_rtmount_init( mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); - /* - * Check that the realtime section is an ok size. - */ - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - xfs_warn(mp, "realtime mount -- %llu != %llu", - (unsigned long long) XFS_BB_TO_FSB(mp, d), - (unsigned long long) mp->m_sb.sb_rblocks); - return -EFBIG; - } - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); - if (error) { - xfs_warn(mp, "realtime device size check failed"); - return error; - } - xfs_buf_relse(bp); - return 0; + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); } static int @@ -1519,50 +1601,10 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } -/* Free space reservations for rt metadata inodes. 
*/ -void -xfs_rt_resv_free( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - unsigned int i; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - for (i = 0; i < XFS_RTGI_MAX; i++) - xfs_metafile_resv_free(rtg->rtg_inodes[i]); - } -} - -/* Reserve space for rt metadata inodes' space expansion. */ -int -xfs_rt_resv_init( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - xfs_filblks_t ask; - int error = 0; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - int err2; - - ask = xfs_rtrmapbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); - if (err2 && !error) - error = err2; - - ask = xfs_rtrefcountbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); - if (err2 && !error) - error = err2; - } - - return error; -} - /* * Read in the bmbt of an rt metadata inode so that we never have to load them * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use @@ -1613,6 +1655,8 @@ xfs_rtmount_rtg( } } + if (xfs_has_zoned(mp)) + return 0; return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); } @@ -2097,6 +2141,8 @@ xfs_bmap_rtalloc( ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; int error; + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); + retry: error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); if (error) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0d95b29092c9..78a690b489ed 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -34,9 +34,6 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ -void xfs_rt_resv_free(struct xfs_mount *mp); -int xfs_rt_resv_init(struct xfs_mount *mp); - /* * Grow the realtime area of the filesystem. */ @@ -65,8 +62,6 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) -# define xfs_rt_resv_free(mp) ((void)0) -# define xfs_rt_resv_init(mp) (0) static inline int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..53944cc7af24 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -109,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), {} }; @@ -182,6 +187,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -233,6 +239,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + return 0; } @@ -533,7 +542,15 @@ xfs_setup_devices( if (error) return error; } - if (mp->m_rtdev_targp) { + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same 
time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) @@ -757,7 +774,7 @@ xfs_mount_free( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -814,6 +831,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -834,10 +852,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. 
@@ -856,8 +876,9 @@ xfs_statfs_rt( struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, - percpu_counter_sum_positive(&mp->m_frextents)); - st->f_blocks = mp->m_sb.sb_rblocks; + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); } static void @@ -922,24 +943,32 @@ xfs_fs_statfs( } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; + uint64_t resblks; + enum xfs_free_counter i; - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); - - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* @@ -976,6 +1005,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -997,6 +1027,7 @@ xfs_fs_unfreeze( * filesystem. 
*/ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1058,6 +1089,19 @@ xfs_finish_flags( return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1065,7 +1109,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1075,30 +1120,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i > 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, 
XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1210,6 +1258,18 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1436,6 +1497,15 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1780,8 +1850,17 @@ 
xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } - if (xfs_has_metadir(mp)) + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && @@ -1793,6 +1872,13 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; @@ -1917,6 +2003,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1961,6 +2050,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. 
*/ xfs_fs_unreserve_ag_blocks(mp); @@ -2082,6 +2174,7 @@ xfs_init_fs_context( for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; @@ -2122,7 +2215,8 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..b0857e3c1270 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,6 +13,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; @@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -701,45 +702,103 @@ out_error: return error; } +static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) +{ + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); +} + +static ssize_t +max_open_zones_show( + struct kobject *kobj, + char *buf) +{ + /* only report the open zones available for user data */ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); +} +XFS_SYSFS_ATTR_RO(max_open_zones); + +static struct attribute *xfs_zoned_attrs[] = { + ATTR_LIST(max_open_zones), + NULL, +}; +ATTRIBUTE_GROUPS(xfs_zoned); + +static const struct kobj_type xfs_zoned_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = 
xfs_zoned_groups, +}; + int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs/<dev>/ */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* .../xfs/<dev>/stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; + + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { + /* .../xfs/<dev>/zoned/ */ + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, + &mp->m_kobj, "zoned"); + if (error) + goto out_remove_error_dir; + } return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + xfs_sysfs_del(&mp->m_zoned_kobj); + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; @@ -749,6 +808,8 @@ xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h 
b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8f530e69c18a..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -49,6 +49,8 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bfc2f1249022..e56ba1963160 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -102,6 +102,7 @@ struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = 
rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 
0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -545,6 +692,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); 
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); +DEFINE_BUF_EVENT(xfs_buf_backing_folio); +DEFINE_BUF_EVENT(xfs_buf_backing_kmem); +DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); +DEFINE_BUF_EVENT(xfs_buf_backing_fallback); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); @@ -1596,6 +1747,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3983,6 +4135,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -5606,11 +5759,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup); /* metadata inode space reservations */ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), - TP_ARGS(ip, len), + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), + TP_ARGS(mp, len), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_ino_t, ino) __field(unsigned long long, freeblks) __field(unsigned long long, reserved) __field(unsigned long long, asked) @@ -5618,19 +5770,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, __field(unsigned long long, len) ), TP_fast_assign( - struct xfs_mount *mp = ip->i_mount; - __entry->dev = mp->m_super->s_dev; - __entry->ino = ip->i_ino; - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); - __entry->reserved = ip->i_delayed_blks; - __entry->asked = ip->i_meta_resv_asked; - __entry->used = 
ip->i_nblocks; + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + __entry->reserved = mp->m_metafile_resv_avail; + __entry->asked = mp->m_metafile_resv_target; + __entry->used = mp->m_metafile_resv_used; __entry->len = len; ), - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->freeblks, __entry->reserved, __entry->asked, @@ -5639,14 +5787,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, ) #define DEFINE_METAFILE_RESV_EVENT(name) \ DEFINE_EVENT(xfs_metafile_resv_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ - TP_ARGS(ip, len)) + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ + TP_ARGS(mp, len)) DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); -DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_growfs_check_rtgeom, @@ -5669,6 +5817,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, ); #endif /* CONFIG_XFS_RT */ +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + 
__entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..52af234936a2 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,1220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, where the zone_info structure + * hasn't been allocated yet. 
Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file systems becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty, remove it from the bottom bucket and + * trigger a reset. + */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full, mark it up as reclaimable + * and wake up GC which might be waiting for zones to reclaim. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. 
+ */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. 
*/ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. + */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. + */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. 
*/ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. 
+ */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. + */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. 
+ */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; + oz->oz_is_gc = is_gc; + + /* + * All dereferences 
of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. + */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. + */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, write_hint, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. 
+ */ + wake_up_all(&zi->zi_zone_wait); + + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +/* + * For data with short or medium lifetime, try to colocate it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. 
+ */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* + * If we couldn't match by inode or life time we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly. + */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi, + enum rw_hint file_hint) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, false)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + +/* + * Try to pack inodes that are written back after they were closed tight instead + * of trying to open new zones for them or spread them to the least recently + * used zone. This optimizes the data layout for workloads that untar or copy + * a lot of small files. Right now this does not separate multiple such + * streams. 
+ */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones just open a new one from the + * freelist. Else try to find one that matches the expected data lifetime. If + * we don't find one that is good pick any zone that is available. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to co-locate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ + spin_lock(&zi->zi_open_zones_lock); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. + */ + oz = xfs_try_open_zone(mp, write_hint); + if (oz) + goto out_unlock; + + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it. 
+ */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= 
IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. + */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, write_hint, pack_tight); + if (!*oz) + goto out_error; + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +/* + * Wake up all threads waiting for a zoned space allocation when the file system + * is shut down. 
+ */ +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + /* + * Don't wake up if there is no m_zone_info. This is complicated by the + * fact that unmount can't atomically clear m_zone_info and thus we need + * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE + * during log recovery so we can't entirely rely on that either. + */ + if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info) + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rtg is a potentially valid block. It might still be + * unused, but that information is only found in the rmap. + */ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + int error; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. 
Instead query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * not recorded in the rmap yet, and it also rewrites blocks if the most + * recently written ones got deleted again before unmount, but this is + * the best we can do without hardware support. + */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + /* + * If there are no used blocks, but the zone is not in empty state yet + * we lost power before the zoned reset. In that case finish the work + * here. + */ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = 
xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the number of + * backing zones available + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone, + * one in reserve as we don't reclaim open zones. One data zone + * and its spare is included in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of available space + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow to hold about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs a lower open limit is advisable as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here. 
+ */ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + +/* + * Allocate the in-core zone info, initialize its lists, locks and wait queue, + * and allocate one used-bucket bitmap for each of the XFS_ZONE_USED_BUCKETS + * buckets. + * + * Returns NULL if any allocation fails; everything allocated up to that point + * is released first. + */ +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + int i; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } + return zi; + +out_free_bitmaps: + /* i is the slot that failed; free slots i - 1 down to and including 0. */ + while (--i >= 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + int i; + + 
xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks size (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + trace_xfs_zones_mount(mp); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_zone_gc_unmount(mp); + xfs_free_zone_info(mp->m_zone_info); +} diff --git a/fs/xfs/xfs_zone_alloc.h 
b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..ecf39106704c --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. + */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); +#else +static inline int 
xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..c5136ea9bb1d --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zones. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. 
To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. 
+ */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone? */ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpad used, and index to indicate which one is used. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. 
+ */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool. + */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to 
evacuate. + */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. 
+ * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. 
+ */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. 
+ */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. + */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. 
+ */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate 
GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. + */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + 
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + 
split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->scratch = chunk->scratch; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, + folio_offset); + + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + chunk->scratch->freed += chunk->len; + if (chunk->scratch->freed == chunk->scratch->offset) { + chunk->scratch->offset = 0; + chunk->scratch->freed = 0; + } + + 
/* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. + */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static bool +xfs_zone_gc_prepare_reset( + struct bio *bio, + struct xfs_rtgroup *rtg) +{ + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + if (!bdev_max_discard_sectors(bio->bi_bdev)) + return false; + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; + bio->bi_iter.bi_size = + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); + } + + return true; +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + int error = 0; + struct bio bio; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, 
NULL, 0, + REQ_OP_ZONE_RESET); + if (xfs_zone_gc_prepare_reset(&bio, rtg)) + error = submit_bio_wait(&bio); + bio_uninit(&bio); + + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (xfs_zone_gc_prepare_reset(bio, rtg)) + submit_bio(bio); + else + bio_endio(bio); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). 
+ */ +static bool +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (!xfs_zone_gc_select_victim(data) || + !xfs_zone_gc_space_available(data)) { + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !reset_list) + return false; + } + + __set_current_state(TASK_RUNNING); + try_to_freeze(); + + if (reset_list) + xfs_zone_gc_reset_zones(data, reset_list); + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + return true; +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that reflinks + * are currently not supported on zoned file systems and can't be created or + * mounted. 
+ */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + if (xfs_zone_gc_handle_work(data)) + continue; + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + struct xfs_open_zone *oz; + int error; + + /* + * If there are no free zones available for GC, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown near ENOSPC while GC was ongoing. + * + * We also need to do this for the first gc zone allocation if we + * unmounted while at the open limit. 
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones == mp->m_max_open_zones) + oz = xfs_zone_gc_steal_open(zi); + else + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (!oz) { + xfs_warn(mp, "unable to allocate a zone for gc"); + error = -EIO; + goto out; + } + + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + zi->zi_open_gc_zone = oz; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) { + error = -ENOMEM; + goto out_put_gc_zone; + } + + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + goto out_free_gc_data; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(mp->m_zone_info->zi_gc_thread); + return 0; + +out_free_gc_data: + kfree(data); +out_put_gc_zone: + xfs_open_zone_put(zi->zi_open_gc_zone); +out: + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); + if (zi->zi_open_gc_zone) + xfs_open_zone_put(zi->zi_open_gc_zone); +} diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c new file mode 100644 index 000000000000..733bcc2f8645 --- /dev/null +++ b/fs/xfs/xfs_zone_info.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" + +static const char xfs_write_hint_shorthand[6][16] = { + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"}; + +static inline const char * +xfs_write_hint_to_str( + uint8_t write_hint) +{ + if (write_hint > WRITE_LIFE_EXTREME) + return "UNKNOWN"; + return xfs_write_hint_shorthand[write_hint]; +} + +static void +xfs_show_open_zone( + struct seq_file *m, + struct xfs_open_zone *oz) +{ + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", + rtg_rgno(oz->oz_rtg), + oz->oz_write_pointer, oz->oz_written, + rtg_rmap(oz->oz_rtg)->i_used_blocks, + xfs_write_hint_to_str(oz->oz_write_hint)); +} + +static void +xfs_show_full_zone_used_distribution( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int reclaimable = 0, full, i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + unsigned int entries = zi->zi_used_bucket_entries[i]; + + seq_printf(m, "\t %2u..%2u%%: %u\n", + i * (100 / XFS_ZONE_USED_BUCKETS), + (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1, + entries); + reclaimable += entries; + } + spin_unlock(&zi->zi_used_buckets_lock); + + full = mp->m_sb.sb_rgcount; + if (zi->zi_open_gc_zone) + full--; + full -= zi->zi_nr_open_zones; + full -= atomic_read(&zi->zi_nr_free_zones); + full -= reclaimable; + + seq_printf(m, "\t 100%%: %u\n", full); +} + +void +xfs_zoned_show_stats( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + seq_puts(m, "\n"); + + seq_printf(m, "\tuser free RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + seq_printf(m, "\treserved free RT blocks: %lld\n", + mp->m_free[XC_FREE_RTEXTENTS].res_avail); + seq_printf(m, 
"\tuser available RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); + seq_printf(m, "\treserved available RT blocks: %lld\n", + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + seq_printf(m, "\tRT reservations required: %d\n", + !list_empty_careful(&zi->zi_reclaim_reservations)); + seq_printf(m, "\tRT GC required: %d\n", + xfs_zoned_need_gc(mp)); + + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); + seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + xfs_show_open_zone(m, oz); + if (zi->zi_open_gc_zone) { + seq_puts(m, "\topen gc zone:\n"); + xfs_show_open_zone(m, zi->zi_open_gc_zone); + } + spin_unlock(&zi->zi_open_zones_lock); + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); + xfs_show_full_zone_used_distribution(m, mp); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..ab696975a993 --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simply the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock. + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Write hint (data temperature) assigned to this zone, or + * WRITE_LIFE_NOT_SET if none was set. 
+ */ + enum rw_hint oz_write_hint; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity gets its own bucket and GC + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. + */ + struct task_struct *zi_gc_thread; + struct xfs_open_zone *zi_open_gc_zone; + + /* + * List of zones that need a reset: + */ + spinlock_t zi_reset_list_lock; + struct xfs_group *zi_reset_list; + + /* + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help + * garbage collection to quickly find the best candidate for reclaim. 
+ */ + spinlock_t zi_used_buckets_lock; + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; + +}; + +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, + enum rw_hint write_hint, bool is_gc); + +int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); +bool xfs_zoned_need_gc(struct xfs_mount *mp); +int xfs_zone_gc_mount(struct xfs_mount *mp); +void xfs_zone_gc_unmount(struct xfs_mount *mp); + +void xfs_zoned_resv_wake_all(struct xfs_mount *mp); + +#endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c new file mode 100644 index 000000000000..93c9a7721139 --- /dev/null +++ b/fs/xfs/xfs_zone_space_resv.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" + +/* + * Note: the zoned allocator does not support a rtextsize > 1, so this code and + * the allocator itself use file system blocks interchangeably with realtime + * extents without doing the otherwise required conversions. + */ + +/* + * Per-task space reservation. + * + * Tasks that need to wait for GC to free up space allocate one of these + * on-stack and add it to the per-mount zi_reclaim_reservations list. + * The GC thread will then wake the tasks in order when space becomes available. + */ +struct xfs_zone_reservation { + struct list_head entry; + struct task_struct *task; + xfs_filblks_t count_fsb; +}; + +/* + * Calculate the number of reserved blocks. 
+ * + * XC_FREE_RTEXTENTS counts the user available capacity, to which the file + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly + * available for writes without waiting for GC. + * + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS + * is further restricted by at least one zone as well as the optional + * persistently reserved blocks. This allows the allocator to run more + * smoothly by not always triggering GC. + */ +uint64_t +xfs_zoned_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + switch (ctr) { + case XC_FREE_RTEXTENTS: + return (uint64_t)XFS_RESERVED_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks + + mp->m_sb.sb_rtreserved; + case XC_FREE_RTAVAILABLE: + return (uint64_t)XFS_GC_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks; + default: + ASSERT(0); + return 0; + } +} + +void +xfs_zoned_resv_wake_all( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + spin_lock(&zi->zi_reservation_lock); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) + wake_up_process(reservation->task); + spin_unlock(&zi->zi_reservation_lock); +} + +void +xfs_zoned_add_available( + struct xfs_mount *mp, + xfs_filblks_t count_fsb) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + if (list_empty_careful(&zi->zi_reclaim_reservations)) { + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + return; + } + + spin_lock(&zi->zi_reservation_lock); + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { + if (reservation->count_fsb > count_fsb) + break; + wake_up_process(reservation->task); + count_fsb -= reservation->count_fsb; + + } + 
spin_unlock(&zi->zi_reservation_lock); +} + +static int +xfs_zoned_space_wait_error( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return -EIO; + if (fatal_signal_pending(current)) + return -EINTR; + return 0; +} + +static int +xfs_zoned_reserve_available( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation reservation = { + .task = current, + .count_fsb = count_fsb, + }; + int error; + + /* + * If there are no waiters, try to directly grab the available blocks + * from the percpu counter. + * + * If the caller wants to dip into the reserved pool also bypass the + * wait list. This relies on the fact that we have a very graciously + * sized reserved pool that always has enough space. If the reserved + * allocations fail we're in trouble. + */ + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || + (flags & XFS_ZR_RESERVED))) { + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + return error; + } + + if (flags & XFS_ZR_NOWAIT) + return -EAGAIN; + + spin_lock(&zi->zi_reservation_lock); + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); + while ((error = xfs_zoned_space_wait_error(mp)) == 0) { + set_current_state(TASK_KILLABLE); + + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + break; + + /* + * Make sure to start GC if it is not running already. As we + * check the rtavailable count when filling up zones, GC is + * normally already running at this point, but in some setups + * with very few zones we may completely run out of non- + * reserved blocks in between filling zones. 
+ */ + if (!xfs_is_zonegc_running(mp)) + wake_up_process(zi->zi_gc_thread); + + /* + * If there is no reclaimable group left and we aren't still + * processing a pending GC request give up as we're fully out + * of space. + */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + !xfs_is_zonegc_running(mp)) + break; + + spin_unlock(&zi->zi_reservation_lock); + schedule(); + spin_lock(&zi->zi_reservation_lock); + } + list_del(&reservation.entry); + spin_unlock(&zi->zi_reservation_lock); + + __set_current_state(TASK_RUNNING); + return error; +} + +/* + * Implement greedy space allocation for short writes by trying to grab all + * that is left after locking out other threads from trying to do the same. + * + * This isn't exactly optimal and can hopefully be replaced by a proper + * percpu_counter primitive one day. + */ +static int +xfs_zoned_reserve_extents_greedy( + struct xfs_inode *ip, + xfs_filblks_t *count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + s64 len = *count_fsb; + int error = -ENOSPC; + + spin_lock(&zi->zi_reservation_lock); + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + if (len > 0) { + *count_fsb = len; + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, + flags & XFS_ZR_RESERVED); + } + spin_unlock(&zi->zi_reservation_lock); + return error; +} + +int +xfs_zoned_space_reserve( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + ASSERT(ac->reserved_blocks == 0); + ASSERT(ac->open_zone == NULL); + + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + if (error) + return error; + + error = xfs_zoned_reserve_available(ip, count_fsb, flags); + if (error) { + 
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); + return error; + } + ac->reserved_blocks = count_fsb; + return 0; +} + +void +xfs_zoned_space_unreserve( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (ac->reserved_blocks > 0) { + struct xfs_mount *mp = ip->i_mount; + + xfs_zoned_add_available(mp, ac->reserved_blocks); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); + } + if (ac->open_zone) + xfs_open_zone_put(ac->open_zone); +} diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); |