diff options
Diffstat (limited to 'fs')
51 files changed, 726 insertions, 332 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b177fd3b1eb3..be5768949cb1 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -655,6 +655,8 @@ const struct file_operations v9fs_cached_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = v9fs_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, }; @@ -667,6 +669,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = { .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = v9fs_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, }; @@ -678,6 +682,8 @@ const struct file_operations v9fs_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, }; @@ -690,6 +696,8 @@ const struct file_operations v9fs_file_operations_dotl = { .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, }; @@ -701,6 +709,8 @@ const struct file_operations v9fs_mmap_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = v9fs_mmap_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, }; @@ -713,5 +723,7 @@ const struct file_operations v9fs_mmap_file_operations_dotl = { .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = v9fs_mmap_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1bb5b9d7f0a2..9068d5578a26 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -823,6 +823,7 @@ static struct inode 
*afs_do_lookup(struct inode *dir, struct dentry *dentry, vp->cb_break_before = afs_calc_vnode_cb_break(vnode); vp->vnode = vnode; vp->put_vnode = true; + vp->speculative = true; /* vnode not locked */ } } } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fe8844b4bee..b0d7b892090d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -294,6 +294,13 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } } else if (vp->scb.have_status) { + if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version && + vp->speculative) + /* Ignore the result of a speculative bulk status fetch + * if it splits around a modification op, thereby + * appearing to regress the data version. + */ + goto out; afs_apply_status(op, vp); if (vp->scb.have_cb) afs_apply_callback(op, vp); @@ -305,6 +312,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v } } +out: write_sequnlock(&vnode->cb_lock); if (vp->scb.have_status) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 14d5d75f4b6e..0d150a29e39e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -755,6 +755,7 @@ struct afs_vnode_param { bool update_ctime:1; /* Need to update the ctime */ bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ + bool speculative:1; /* T if speculative status fetch (no vnode lock) */ }; /* diff --git a/fs/afs/super.c b/fs/afs/super.c index 6c5900df6aa5..e38bb1e7a4d2 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -230,6 +230,9 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) _enter(",%s", name); + if (fc->source) + return invalf(fc, "kAFS: Multiple sources not supported"); + if (!name) { printk(KERN_ERR "kAFS: no volume name specified\n"); return -EINVAL; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0378933d163c..0b29bdb25105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -878,7 +878,10 @@ 
struct btrfs_fs_info { */ struct ulist *qgroup_ulist; - /* protect user change for quota operations */ + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ struct mutex qgroup_ioctl_lock; /* list of dirty qgroups to be written at next commit */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 87355a38a654..4373da7bcc0d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); - if (!btrfs_is_free_space_inode(inode)) { - if (start_pos >= isize && - !(inode->flags & BTRFS_INODE_PREALLOC)) { - /* - * There can't be any extents following eof in this case - * so just set the delalloc new bit for the range - * directly. 
- */ - extra_bits |= EXTENT_DELALLOC_NEW; - } else { - err = btrfs_find_new_delalloc_bytes(inode, start_pos, - num_bytes, cached); - if (err) - return err; - } - } - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); if (err) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da58c58ef9aa..7e8d8169779d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2253,11 +2253,69 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, return 0; } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); + + if (start >= i_size_read(&inode->vfs_inode) && + !(inode->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case so just + * set the delalloc new bit for the range directly. 
+ */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + int ret; + + ret = btrfs_find_new_delalloc_bytes(inode, start, + end + 1 - start, + cached_state); + if (ret) + return ret; + } + return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, cached_state); } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 77c54749f432..87bd37b70738 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" @@ -497,13 +498,13 @@ next2: break; } out: + btrfs_free_path(path); fs_info->qgroup_flags |= flags; if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && ret >= 0) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); - btrfs_free_path(path); if (ret < 0) { ulist_free(fs_info->qgroup_ulist); @@ -936,6 +937,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; struct btrfs_trans_handle *trans = NULL; + struct ulist *ulist = NULL; int ret = 0; int slot; @@ -943,8 +945,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (fs_info->quota_root) goto out; - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { + ulist = ulist_alloc(GFP_KERNEL); + if (!ulist) { ret = -ENOMEM; goto out; } @@ -952,6 +954,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; + + /* + * Unlock qgroup_ioctl_lock before starting the transaction. This is to + * avoid lock acquisition inversion problems (reported by lockdep) between + * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we + * start a transaction. 
+ * After we started the transaction lock qgroup_ioctl_lock again and + * check if someone else created the quota root in the meanwhile. If so, + * just return success and release the transaction handle. + * + * Also we don't need to worry about someone else calling + * btrfs_sysfs_add_qgroups() after we unlock and getting an error because + * that function returns 0 (success) when the sysfs entries already exist. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -961,12 +979,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) * would be a lot of overkill. */ trans = btrfs_start_transaction(tree_root, 2); + + mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } + if (fs_info->quota_root) + goto out; + + fs_info->qgroup_ulist = ulist; + ulist = NULL; + /* * initially create the quota tree */ @@ -1124,11 +1150,14 @@ out: if (ret) { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; - if (trans) - btrfs_end_transaction(trans); btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + ulist_free(ulist); return ret; } @@ -1141,19 +1170,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. + * + * Also, we must always start a transaction without holding the mutex + * qgroup_ioctl_lock, see btrfs_quota_enable(). 
*/ trans = btrfs_start_transaction(fs_info->tree_root, 1); + + mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + trans = NULL; goto out; } + if (!fs_info->quota_root) + goto out; + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); @@ -1167,13 +1206,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) { btrfs_abort_transaction(trans, ret); - goto end_trans; + goto out; } ret = btrfs_del_root(trans, &quota_root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); - goto end_trans; + goto out; } list_del(&quota_root->dirty_list); @@ -1185,10 +1224,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_put_root(quota_root); -end_trans: - ret = btrfs_end_transaction(trans); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + return ret; } @@ -1324,13 +1366,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; struct ulist *tmp; + unsigned int nofs_flag; int ret = 0; /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; + /* We hold a transaction handle open, must do a NOFS allocation. */ + nofs_flag = memalloc_nofs_save(); tmp = ulist_alloc(GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!tmp) return -ENOMEM; @@ -1387,10 +1433,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup_list *list; struct ulist *tmp; bool found = false; + unsigned int nofs_flag; int ret = 0; int ret2; + /* We hold a transaction handle open, must do a NOFS allocation.
*/ + nofs_flag = memalloc_nofs_save(); tmp = ulist_alloc(GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!tmp) return -ENOMEM; @@ -3512,6 +3562,7 @@ static int try_flush_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle *trans; int ret; + bool can_commit = true; /* * We don't want to run flush again and again, so if there is a running @@ -3523,6 +3574,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } + /* + * If current process holds a transaction, we shouldn't flush, as we + * assume all space reservation happens before a transaction handle is + * held. + * + * But there are cases like btrfs_delayed_item_reserve_metadata() where + * we try to reserve space with one transaction handle already held. + * In that case we can't commit transaction, but at least try to end it + * and hope the started data writes can free some space. + */ + if (current->journal_info && + current->journal_info != BTRFS_SEND_TRANS_STUB) + can_commit = false; + ret = btrfs_start_delalloc_snapshot(root); if (ret < 0) goto out; @@ -3534,7 +3599,10 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; } - ret = btrfs_commit_transaction(trans); + if (can_commit) + ret = btrfs_commit_transaction(trans); + else + ret = btrfs_end_transaction(trans); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index e6719f7db386..04022069761d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -983,7 +983,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1050,7 +1051,8 @@ static int
test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1082,7 +1084,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1097,7 +1100,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8784b74f5232..ea2bb4cb5890 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1068,6 +1068,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, "invalid root item size, have %u expect %zu or %u", btrfs_item_size_nr(leaf, slot), sizeof(ri), btrfs_legacy_root_item_size()); + return -EUCLEAN; } /* @@ -1423,6 +1424,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf, "invalid item size, have %u expect aligned to %zu for key type %u", btrfs_item_size_nr(leaf, slot), sizeof(*dref), key->type); + return -EUCLEAN; } if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { generic_err(leaf, slot, @@ -1451,6 +1453,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf, extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", offset, 
leaf->fs_info->sectorsize); + return -EUCLEAN; } } return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6406b3b8c2b..78637665166e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -940,7 +940,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev != path_bdev) { bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); - btrfs_warn_in_rcu(device->fs_info, + /* + * device->fs_info may not be reliable here, so + * pass in a NULL instead. This avoids a + * possible use-after-free when the fs_info and + * fs_info->sb are already torn down. + */ + btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 23b21e943652..ef4784e72b1d 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1266,6 +1266,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); } else if (mode_from_special_sid) { rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true); + kfree(pntsd); } else { /* get approximated mode from ACL */ rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c38156f324dd..44f9cce57099 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -876,6 +876,8 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); + cancel_delayed_work_sync(&server->echo); + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -4544,7 +4546,8 @@ static void set_root_ses(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, if (ses) { spin_lock(&cifs_tcp_ses_lock); ses->ses_count++; - ses->tcon_ipc->remap = cifs_remap(cifs_sb); + if (ses->tcon_ipc) + ses->tcon_ipc->remap = cifs_remap(cifs_sb); 
spin_unlock(&cifs_tcp_ses_lock); } *root_ses = ses; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 504766cb6c19..3d914d7d0d11 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -264,7 +264,7 @@ smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) } static struct mid_q_entry * -smb2_find_mid(struct TCP_Server_Info *server, char *buf) +__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) { struct mid_q_entry *mid; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; @@ -281,6 +281,10 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) (mid->mid_state == MID_REQUEST_SUBMITTED) && (mid->command == shdr->Command)) { kref_get(&mid->refcount); + if (dequeue) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); return mid; } @@ -289,6 +293,18 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) return NULL; } +static struct mid_q_entry * +smb2_find_mid(struct TCP_Server_Info *server, char *buf) +{ + return __smb2_find_mid(server, buf, false); +} + +static struct mid_q_entry * +smb2_find_dequeue_mid(struct TCP_Server_Info *server, char *buf) +{ + return __smb2_find_mid(server, buf, true); +} + static void smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { @@ -3098,8 +3114,8 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = SMB2_ioctl_init(tcon, server, - &rqst[1], fid.persistent_fid, - fid.volatile_fid, FSCTL_GET_REPARSE_POINT, + &rqst[1], COMPOUND_FID, + COMPOUND_FID, FSCTL_GET_REPARSE_POINT, true /* is_fctl */, NULL, 0, CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - @@ -4356,7 +4372,8 @@ init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, char *buf, unsigned int buf_len, struct page **pages, - unsigned int npages, unsigned int page_data_size) + unsigned 
int npages, unsigned int page_data_size, + bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4378,7 +4395,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { - cifs_reconnect(server); + if (!is_offloaded) + cifs_reconnect(server); return -1; } @@ -4402,7 +4420,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); /* normal error on read response */ - dequeue_mid(mid, false); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_RECEIVED; + else + dequeue_mid(mid, false); return 0; } @@ -4426,7 +4447,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", __func__, data_offset); rdata->result = -EIO; - dequeue_mid(mid, rdata->result); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; + else + dequeue_mid(mid, rdata->result); return 0; } @@ -4442,21 +4466,30 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, cifs_dbg(FYI, "%s: data offset (%u) beyond 1st page of response\n", __func__, data_offset); rdata->result = -EIO; - dequeue_mid(mid, rdata->result); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; + else + dequeue_mid(mid, rdata->result); return 0; } if (data_len > page_data_size - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; - dequeue_mid(mid, rdata->result); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; + else + dequeue_mid(mid, rdata->result); return 0; } rdata->result = init_read_bvec(pages, npages, page_data_size, cur_off, &bvec); if (rdata->result != 0) { - dequeue_mid(mid, rdata->result); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; + else + dequeue_mid(mid, rdata->result); return 0; } @@ -4471,7 +4504,10 @@ 
handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); rdata->result = -EIO; - dequeue_mid(mid, rdata->result); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; + else + dequeue_mid(mid, rdata->result); return 0; } @@ -4482,7 +4518,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (length < 0) return length; - dequeue_mid(mid, false); + if (is_offloaded) + mid->mid_state = MID_RESPONSE_RECEIVED; + else + dequeue_mid(mid, false); return length; } @@ -4511,15 +4550,34 @@ static void smb2_decrypt_offload(struct work_struct *work) } dw->server->lstrp = jiffies; - mid = smb2_find_mid(dw->server, dw->buf); + mid = smb2_find_dequeue_mid(dw->server, dw->buf); if (mid == NULL) cifs_dbg(FYI, "mid not found\n"); else { mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len); - mid->callback(mid); + dw->ppages, dw->npages, dw->len, + true); + if (rc >= 0) { +#ifdef CONFIG_CIFS_STATS2 + mid->when_received = jiffies; +#endif + mid->callback(mid); + } else { + spin_lock(&GlobalMid_Lock); + if (dw->server->tcpStatus == CifsNeedReconnect) { + mid->mid_state = MID_RETRY_NEEDED; + spin_unlock(&GlobalMid_Lock); + mid->callback(mid); + } else { + mid->mid_state = MID_REQUEST_SUBMITTED; + mid->mid_flags &= ~(MID_DELETED); + list_add_tail(&mid->qhead, + &dw->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + } + } cifs_mid_q_entry_release(mid); } @@ -4622,7 +4680,7 @@ non_offloaded_decrypt: (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - pages, npages, len); + pages, npages, len, false); } free_pages: @@ -4765,7 +4823,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->large_buf ? 
server->bigbuf : server->smallbuf; return handle_read_data(server, mid, buf, server->pdu_size, - NULL, 0, 0); + NULL, 0, 0, false); } static int diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 445e80862865..acb72705062d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2272,17 +2272,15 @@ static struct crt_sd_ctxt * create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) { struct crt_sd_ctxt *buf; - struct cifs_ace *pace; - unsigned int sdlen, acelen; + __u8 *ptr, *aclptr; + unsigned int acelen, acl_size, ace_count; unsigned int owner_offset = 0; unsigned int group_offset = 0; + struct smb3_acl acl; - *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 2), 8); + *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); if (set_owner) { - /* offset fields are from beginning of security descriptor not of create context */ - owner_offset = sizeof(struct smb3_acl) + (sizeof(struct cifs_ace) * 2); - /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ *len += sizeof(struct owner_group_sids); } @@ -2291,26 +2289,22 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) if (buf == NULL) return buf; + ptr = (__u8 *)&buf[1]; if (set_owner) { + /* offset fields are from beginning of security descriptor not of create context */ + owner_offset = ptr - (__u8 *)&buf->sd; buf->sd.OffsetOwner = cpu_to_le32(owner_offset); - group_offset = owner_offset + sizeof(struct owner_sid); + group_offset = owner_offset + offsetof(struct owner_group_sids, group); buf->sd.OffsetGroup = cpu_to_le32(group_offset); + + setup_owner_group_sids(ptr); + ptr += sizeof(struct owner_group_sids); } else { buf->sd.OffsetOwner = 0; buf->sd.OffsetGroup = 0; } - sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) + - 2 * sizeof(struct cifs_ace); - if (set_owner) { - sdlen += sizeof(struct owner_group_sids); - setup_owner_group_sids(owner_offset + sizeof(struct create_context) + 8 /* name */ - + (char 
*)buf); - } - - buf->ccontext.DataOffset = cpu_to_le16(offsetof - (struct crt_sd_ctxt, sd)); - buf->ccontext.DataLength = cpu_to_le32(sdlen); + buf->ccontext.DataOffset = cpu_to_le16(offsetof(struct crt_sd_ctxt, sd)); buf->ccontext.NameOffset = cpu_to_le16(offsetof(struct crt_sd_ctxt, Name)); buf->ccontext.NameLength = cpu_to_le16(4); /* SMB2_CREATE_SD_BUFFER_TOKEN is "SecD" */ @@ -2319,6 +2313,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) buf->Name[2] = 'c'; buf->Name[3] = 'D'; buf->sd.Revision = 1; /* Must be one see MS-DTYP 2.4.6 */ + /* * ACL is "self relative" ie ACL is stored in contiguous block of memory * and "DP" ie the DACL is present @@ -2326,28 +2321,38 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) buf->sd.Control = cpu_to_le16(ACL_CONTROL_SR | ACL_CONTROL_DP); /* offset owner, group and Sbz1 and SACL are all zero */ - buf->sd.OffsetDacl = cpu_to_le32(sizeof(struct smb3_sd)); - buf->acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ + buf->sd.OffsetDacl = cpu_to_le32(ptr - (__u8 *)&buf->sd); + /* Skip the ACL for now; we will copy it into buf later. */ + aclptr = ptr; + ptr += sizeof(struct cifs_acl); /* create one ACE to hold the mode embedded in reserved special SID */ - pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf); - acelen = setup_special_mode_ACE(pace, (__u64)mode); + acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode); + ptr += acelen; + acl_size = acelen + sizeof(struct smb3_acl); + ace_count = 1; if (set_owner) { /* we do not need to reallocate buffer to add the two more ACEs.
plenty of space */ - pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) + (char *)buf)); - acelen += setup_special_user_owner_ACE(pace); - /* it does not appear necessary to add an ACE for the NFS group SID */ - buf->acl.AceCount = cpu_to_le16(3); - } else - buf->acl.AceCount = cpu_to_le16(2); + acelen = setup_special_user_owner_ACE((struct cifs_ace *)ptr); + ptr += acelen; + acl_size += acelen; + ace_count += 1; + } /* and one more ACE to allow access for authenticated users */ - pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) + - (char *)buf)); - acelen += setup_authusers_ACE(pace); - - buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen); + acelen = setup_authusers_ACE((struct cifs_ace *)ptr); + ptr += acelen; + acl_size += acelen; + ace_count += 1; + + acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ + acl.AclSize = cpu_to_le16(acl_size); + acl.AceCount = cpu_to_le16(ace_count); + memcpy(aclptr, &acl, sizeof(struct cifs_acl)); + + buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); + *len = ptr - (__u8 *)buf; return buf; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f05f9b12f689..fa57b03ca98c 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -963,8 +963,6 @@ struct crt_sd_ctxt { struct create_context ccontext; __u8 Name[8]; struct smb3_sd sd; - struct smb3_acl acl; - /* Followed by at least 4 ACEs */ } __packed; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e27e255d40dd..36b2ece43403 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -339,8 +339,8 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EAGAIN; if (signal_pending(current)) { - cifs_dbg(FYI, "signal is pending before sending any data\n"); - return -EINTR; + cifs_dbg(FYI, "signal pending before send request\n"); + return -ERESTARTSYS; } /* cork the socket */ diff --git a/fs/coredump.c b/fs/coredump.c index 0cd9056d79cc..c6acfc694f65 100644 --- a/fs/coredump.c +++ 
b/fs/coredump.c @@ -229,7 +229,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, */ if (ispipe) { if (isspace(*pat_ptr)) { - was_space = true; + if (cn->used != 0) + was_space = true; pat_ptr++; continue; } else if (was_space) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 96c0c86f3fff..0297ad95eb5c 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -7,6 +7,7 @@ #include <linux/efi.h> #include <linux/fs.h> #include <linux/ctype.h> +#include <linux/kmemleak.h> #include <linux/slab.h> #include <linux/uuid.h> @@ -103,6 +104,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, var->var.VariableName[i] = '\0'; inode->i_private = var; + kmemleak_ignore(var); err = efivar_entry_add(var, &efivarfs_list); if (err) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf9429484462..65ecaf96d0a4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2695,7 +2695,8 @@ void ext4_insert_dentry(struct inode *inode, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { - if (!ext4_has_feature_dir_index(inode->i_sb)) { + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... 
*/ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6633b20224d5..94472044f4c1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2638,10 +2638,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } - - if (test_opt2(sb, JOURNAL_FAST_COMMIT)) - SEQ_OPTS_PUTS("fast_commit"); - ext4_show_quota_options(seq, sb); return 0; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d98a2e5dab9f..35a6fd103761 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1035,6 +1035,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_node.next = NULL; gl->gl_flags = 0; gl->gl_name = name; + lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 6c1432d78dce..3faa421568b0 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -245,7 +245,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf) { - struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + struct gfs2_rgrpd *rgd = gl->gl_object; if (rgd) gfs2_rgrp_dump(seq, rgd, fs_id_buf); @@ -571,7 +571,19 @@ static int freeze_go_sync(struct gfs2_glock *gl) int error = 0; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) { + /* + * We need to check gl_state == LM_ST_SHARED here and not gl_req == + * LM_ST_EXCLUSIVE. That's because when any node does a freeze, + * all the nodes should have the freeze glock in SH mode and they all + * call do_xmote: One for EX and the others for UN. They ALL must + * freeze locally, and they ALL must queue freeze work. 
The freeze_work + * calls freeze_func, which tries to reacquire the freeze glock in SH, + * effectively waiting for the thaw on the node who holds it in EX. + * Once thawed, the work func acquires the freeze glock in + * SH and everybody goes back to thawed. + */ + if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) && + !test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); error = freeze_super(sdp->sd_vfs); if (error) { @@ -770,6 +782,7 @@ const struct gfs2_glock_operations gfs2_iopen_glops = { .go_callback = iopen_go_callback, .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, + .go_subclass = 1, }; const struct gfs2_glock_operations gfs2_flock_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d7707307f4b1..f8858d995b24 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -247,6 +247,7 @@ struct gfs2_glock_operations { const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); void (*go_free)(struct gfs2_glock *gl); + const int go_subclass; const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 /* address space attached */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 077ccb1b3ccc..65ae4fc28ede 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -150,6 +150,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) goto fail; + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { /* @@ -180,8 +182,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; - if (blktype != GFS2_BLKST_UNLINKED) - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = 
NULL; @@ -725,13 +725,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_free_inode; + gfs2_cancel_delete_work(io_gl); + glock_set_object(io_gl, ip); + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock2; error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_free_inode; + goto fail_gunlock2; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -740,18 +746,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, init_dinode(dip, ip, symname); gfs2_trans_end(sdp); - error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (error) - goto fail_free_inode; - BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); - glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -803,6 +803,7 @@ fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + glock_clear_object(io_gl, ip); gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { @@ -2116,6 +2117,25 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset) return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); } +static int gfs2_update_time(struct inode *inode, struct timespec64 *time, + int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct gfs2_holder *gh; + int error; + + gh = gfs2_glock_is_locked_by_me(gl); + if (gh && !gfs2_glock_is_held_excl(gl)) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh); + error = 
gfs2_glock_nq(gh); + if (error) + return error; + } + return generic_update_time(inode, time, flags); +} + const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, @@ -2124,6 +2144,7 @@ const struct inode_operations gfs2_file_iops = { .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, + .update_time = gfs2_update_time, }; const struct inode_operations gfs2_dir_iops = { @@ -2143,6 +2164,7 @@ const struct inode_operations gfs2_dir_iops = { .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, + .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f7addc6197ed..5e8eef9990e3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -985,6 +985,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip) if (error < 0) return error; + if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) { + fs_err(sdp, "no resource groups found in the file system.\n"); + return -ENOENT; + } set_rgrp_preferences(sdp); sdp->sd_rindex_uptodate = 1; diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ead291b2976..86dac2b2e276 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -205,6 +205,7 @@ struct fixed_file_ref_node { struct list_head file_list; struct fixed_file_data *file_data; struct llist_node llist; + bool done; }; struct fixed_file_data { @@ -478,6 +479,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + bool ignore_nonblock; struct filename *filename; struct open_how how; unsigned long nofile; @@ -1282,7 +1284,7 @@ static bool io_identity_cow(struct io_kiocb *req) */ io_init_identity(id); if (creds) - req->work.identity->creds = creds; + id->creds = creds; /* add one for this request */ refcount_inc(&id->count); @@ -1311,22 +1313,6 @@ static bool io_grab_identity(struct io_kiocb *req) return false; req->work.flags |= IO_WQ_WORK_FSIZE; } - - if (!(req->work.flags & IO_WQ_WORK_FILES) && - (def->work_flags & IO_WQ_WORK_FILES) && - 
!(req->flags & REQ_F_NO_FILE_TABLE)) { - if (id->files != current->files || - id->nsproxy != current->nsproxy) - return false; - atomic_inc(&id->files->count); - get_nsproxy(id->nsproxy); - req->flags |= REQ_F_INFLIGHT; - - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); - req->work.flags |= IO_WQ_WORK_FILES; - } #ifdef CONFIG_BLK_CGROUP if (!(req->work.flags & IO_WQ_WORK_BLKCG) && (def->work_flags & IO_WQ_WORK_BLKCG)) { @@ -1368,6 +1354,21 @@ static bool io_grab_identity(struct io_kiocb *req) } spin_unlock(¤t->fs->lock); } + if (!(req->work.flags & IO_WQ_WORK_FILES) && + (def->work_flags & IO_WQ_WORK_FILES) && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + if (id->files != current->files || + id->nsproxy != current->nsproxy) + return false; + atomic_inc(&id->files->count); + get_nsproxy(id->nsproxy); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + req->work.flags |= IO_WQ_WORK_FILES; + } return true; } @@ -2577,7 +2578,6 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) } end_req: req_set_fail_links(req); - io_req_complete(req, ret); return false; } #endif @@ -3192,7 +3192,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ - if (iter->type == ITER_BVEC) + if (iov_iter_is_bvec(iter)) return; if (!iovec) { unsigned iov_off = 0; @@ -3795,6 +3795,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return ret; } req->open.nofile = rlimit(RLIMIT_NOFILE); + req->open.ignore_nonblock = false; req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -3838,7 +3839,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock) struct file *file; int ret; - if (force_nonblock) + if (force_nonblock && 
!req->open.ignore_nonblock) return -EAGAIN; ret = build_open_flags(&req->open.how, &op); @@ -3853,6 +3854,21 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock) if (IS_ERR(file)) { put_unused_fd(ret); ret = PTR_ERR(file); + /* + * A work-around to ensure that /proc/self works that way + * that it should - if we get -EOPNOTSUPP back, then assume + * that proc_self_get_link() failed us because we're in async + * context. We should be safe to retry this from the task + * itself with force_nonblock == false set, as it should not + * block on lookup. Would be nice to know this upfront and + * avoid the async dance, but doesn't seem feasible. + */ + if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) { + req->open.ignore_nonblock = true; + refcount_inc(&req->refs); + io_req_task_queue(req); + return 0; + } } else { fsnotify_open(file); fd_install(ret, file); @@ -4483,7 +4499,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = iomsg->iov[0].iov_len; + sr->len = clen; + iomsg->iov[0].iov_len = clen; iomsg->iov = NULL; } else { ret = __import_iovec(READ, (struct iovec __user *)uiov, len, @@ -6957,9 +6974,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return -ENXIO; spin_lock(&data->lock); - if (!list_empty(&data->ref_list)) - ref_node = list_first_entry(&data->ref_list, - struct fixed_file_ref_node, node); + ref_node = data->node; spin_unlock(&data->lock); if (ref_node) percpu_ref_kill(&ref_node->refs); @@ -7308,10 +7323,6 @@ static void __io_file_put_work(struct fixed_file_ref_node *ref_node) kfree(pfile); } - spin_lock(&file_data->lock); - list_del(&ref_node->node); - spin_unlock(&file_data->lock); - percpu_ref_exit(&ref_node->refs); kfree(ref_node); percpu_ref_put(&file_data->refs); @@ -7338,17 +7349,32 @@ static void io_file_put_work(struct work_struct *work) static void io_file_data_ref_zero(struct percpu_ref *ref) { struct fixed_file_ref_node *ref_node; + struct 
fixed_file_data *data; struct io_ring_ctx *ctx; - bool first_add; + bool first_add = false; int delay = HZ; ref_node = container_of(ref, struct fixed_file_ref_node, refs); - ctx = ref_node->file_data->ctx; + data = ref_node->file_data; + ctx = data->ctx; + + spin_lock(&data->lock); + ref_node->done = true; - if (percpu_ref_is_dying(&ctx->file_data->refs)) + while (!list_empty(&data->ref_list)) { + ref_node = list_first_entry(&data->ref_list, + struct fixed_file_ref_node, node); + /* recycle ref nodes in order */ + if (!ref_node->done) + break; + list_del(&ref_node->node); + first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); + } + spin_unlock(&data->lock); + + if (percpu_ref_is_dying(&data->refs)) delay = 0; - first_add = llist_add(&ref_node->llist, &ctx->file_put_llist); if (!delay) mod_delayed_work(system_wq, &ctx->file_put_work, 0); else if (first_add) @@ -7372,6 +7398,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node( INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->file_list); ref_node->file_data = ctx->file_data; + ref_node->done = false; return ref_node; } @@ -7467,7 +7494,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, file_data->node = ref_node; spin_lock(&file_data->lock); - list_add(&ref_node->node, &file_data->ref_list); + list_add_tail(&ref_node->node, &file_data->ref_list); spin_unlock(&file_data->lock); percpu_ref_get(&file_data->refs); return ret; @@ -7626,7 +7653,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); spin_lock(&data->lock); - list_add(&ref_node->node, &data->ref_list); + list_add_tail(&ref_node->node, &data->ref_list); data->node = ref_node; spin_unlock(&data->lock); percpu_ref_get(&ctx->file_data->refs); @@ -9156,6 +9183,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) { struct file *file; int ret; + int fd; #if defined(CONFIG_UNIX) ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, 
IPPROTO_IP, @@ -9167,12 +9195,12 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (ret < 0) goto err; + fd = ret; file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); if (IS_ERR(file)) { -err_fd: - put_unused_fd(ret); + put_unused_fd(fd); ret = PTR_ERR(file); goto err; } @@ -9180,12 +9208,14 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(ctx, file))) { - file = ERR_PTR(-ENOMEM); - goto err_fd; + ret = io_uring_add_task_file(ctx, file); + if (ret) { + fput(file); + put_unused_fd(fd); + goto err; } - fd_install(ret, file); - return ret; + fd_install(fd, file); + return fd; err: #if defined(CONFIG_UNIX) sock_release(ctx->ring_sock); @@ -9225,14 +9255,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ - p->cq_entries = roundup_pow_of_two(p->cq_entries); - if (p->cq_entries < p->sq_entries) + if (!p->cq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } + p->cq_entries = roundup_pow_of_two(p->cq_entries); + if (p->cq_entries < p->sq_entries) + return -EINVAL; } else { p->cq_entries = 2 * p->sq_entries; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c3d5e3b24b2..188f79d76988 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -566,12 +566,14 @@ static int __jbd2_journal_force_commit(journal_t *journal) } /** - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. + * jbd2_journal_force_commit_nested - Force and wait upon a commit if the + * calling process is not within transaction. 
* * @journal: journal to force * Returns true if progress was made. + * + * This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. */ int jbd2_journal_force_commit_nested(journal_t *journal) { @@ -582,7 +584,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) } /** - * int journal_force_commit() - force any uncommitted transactions + * jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * Caller want unconditional commit. We can only force the running transaction @@ -1881,7 +1883,7 @@ static int load_superblock(journal_t *journal) /** - * int jbd2_journal_load() - Read journal from disk. + * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. * * Given a journal_t structure which tells us which disk blocks contain @@ -1951,7 +1953,7 @@ recovery_error: } /** - * void jbd2_journal_destroy() - Release a journal_t structure. + * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * * Release a journal_t structure once it is no longer in use by the @@ -2028,7 +2030,7 @@ int jbd2_journal_destroy(journal_t *journal) /** - *int jbd2_journal_check_used_features() - Check if features specified are used. + * jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2063,7 +2065,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, } /** - * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * jbd2_journal_check_available_features() - Check feature set in journalling layer * @journal: Journal to check. 
* @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2126,7 +2128,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) } /** - * int jbd2_journal_set_features() - Mark a given journal feature in the superblock + * jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2217,7 +2219,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, } /* - * jbd2_journal_clear_features () - Clear a given journal feature in the + * jbd2_journal_clear_features() - Clear a given journal feature in the * superblock * @journal: Journal to act on. * @compat: bitmask of compatible features @@ -2246,7 +2248,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_flush () - Flush journal + * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. * * Flush all data for a given journal to disk and empty the journal. @@ -2321,7 +2323,7 @@ out: } /** - * int jbd2_journal_wipe() - Wipe journal contents + * jbd2_journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) * @@ -2362,7 +2364,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) } /** - * void jbd2_journal_abort () - Shutdown the journal immediately. + * jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. * @errno: an error number to record in the journal indicating * the reason for the shutdown. @@ -2453,7 +2455,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) } /** - * int jbd2_journal_errno () - returns the journal's error state. + * jbd2_journal_errno() - returns the journal's error state. * @journal: journal to examine. 
* * This is the errno number set with jbd2_journal_abort(), the last @@ -2477,7 +2479,7 @@ int jbd2_journal_errno(journal_t *journal) } /** - * int jbd2_journal_clear_err () - clears the journal's error state + * jbd2_journal_clear_err() - clears the journal's error state * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly @@ -2497,7 +2499,7 @@ int jbd2_journal_clear_err(journal_t *journal) } /** - * void jbd2_journal_ack_err() - Ack journal err. + * jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d54f04674e8e..9396666b7314 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(jbd2__journal_start); /** - * handle_t *jbd2_journal_start() - Obtain a new handle. + * jbd2_journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * @@ -566,7 +566,7 @@ void jbd2_journal_free_reserved(handle_t *handle) EXPORT_SYMBOL(jbd2_journal_free_reserved); /** - * int jbd2_journal_start_reserved() - start reserved handle + * jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start * @type: for handle statistics * @line_no: for handle statistics @@ -620,7 +620,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, EXPORT_SYMBOL(jbd2_journal_start_reserved); /** - * int jbd2_journal_extend() - extend buffer credits. + * jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. * @revoke_records: number of revoke records to try to extend by. @@ -745,7 +745,7 @@ static void stop_this_handle(handle_t *handle) } /** - * int jbd2_journal_restart() - restart a handle . + * jbd2__journal_restart() - restart a handle . 
* @handle: handle to restart * @nblocks: nr credits requested * @revoke_records: number of revoke record credits requested @@ -815,7 +815,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) EXPORT_SYMBOL(jbd2_journal_restart); /** - * void jbd2_journal_lock_updates () - establish a transaction barrier. + * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks @@ -874,7 +874,7 @@ void jbd2_journal_lock_updates(journal_t *journal) } /** - * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * jbd2_journal_unlock_updates () - release barrier * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with jbd2_journal_lock_updates(). @@ -1182,7 +1182,8 @@ out: } /** - * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * jbd2_journal_get_write_access() - notify intent to modify a buffer + * for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * @@ -1226,7 +1227,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) * unlocked buffer beforehand. */ /** - * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * jbd2_journal_get_create_access () - notify intent to use newly created bh * @handle: transaction to new buffer to * @bh: new buffer. 
* @@ -1306,7 +1307,7 @@ out: } /** - * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * jbd2_journal_get_undo_access() - Notify intent to modify metadata with * non-rewindable consequences * @handle: transaction * @bh: buffer to undo @@ -1383,7 +1384,7 @@ out: } /** - * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * jbd2_journal_set_triggers() - Add triggers for commit writeout * @bh: buffer to trigger on * @type: struct jbd2_buffer_trigger_type containing the trigger(s). * @@ -1425,7 +1426,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, } /** - * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * @@ -1593,7 +1594,7 @@ out: } /** - * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * @@ -1762,7 +1763,7 @@ drop: } /** - * int jbd2_journal_stop() - complete a transaction + * jbd2_journal_stop() - complete a transaction * @handle: transaction to complete. * * All done for a particular handle. @@ -2080,7 +2081,7 @@ out: } /** - * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free * @@ -2411,7 +2412,7 @@ zap_buffer_unlocked: } /** - * void jbd2_journal_invalidatepage() + * jbd2_journal_invalidatepage() * @journal: journal to use for flush... 
* @page: page to flush * @offset: start of the range to invalidate diff --git a/fs/libfs.c b/fs/libfs.c index fc34361c1489..7124c2e8df2f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -959,7 +959,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; - u64 val; + unsigned long long val; size_t size; ssize_t ret; @@ -977,7 +977,9 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - val = simple_strtoll(attr->set_buf, NULL, 0); + ret = kstrtoull(attr->set_buf, 0, &val); + if (ret) + goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 88e1763e02f3..e2a488d403a6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -205,3 +205,12 @@ config NFS_DISABLE_UDP_SUPPORT Choose Y here to disable the use of NFS over UDP. NFS over UDP on modern networks (1Gb+) can lead to data corruption caused by fragmentation during high loads. + +config NFS_V4_2_READ_PLUS + bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" + depends on NFS_V4_2 + default n + help + This is intended for developers only. The READ_PLUS operation has + been shown to have issues under specific conditions and should not + be used in production. 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a163533446fa..24bf5797f88a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -838,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx, i; + u32 ds_idx; retry: ff_layout_pg_check_layout(pgio, req); @@ -864,11 +864,9 @@ retry: goto retry; } - for (i = 0; i < pgio->pg_mirror_count; i++) { - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + pgm = &pgio->pg_mirrors[0]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; @@ -985,6 +983,21 @@ out: return 1; } +static u32 +ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + u32 old = desc->pg_mirror_idx; + + desc->pg_mirror_idx = idx; + return old; +} + +static struct nfs_pgio_mirror * +ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + return &desc->pg_mirrors[idx]; +} + static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, .pg_test = pnfs_generic_pg_test, @@ -998,6 +1011,8 @@ static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, + .pg_get_mirror = ff_layout_pg_get_mirror_write, + .pg_set_mirror = ff_layout_pg_set_mirror_write, }; static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 2b2211d1234e..4fc61e3d098d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1241,12 +1241,13 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, 
.rpc_resp = &res, }; u32 xdrlen; - int ret, np; + int ret, np, i; + ret = -ENOMEM; res.scratch = alloc_page(GFP_KERNEL); if (!res.scratch) - return -ENOMEM; + goto out; xdrlen = nfs42_listxattr_xdrsize(buflen); if (xdrlen > server->lxasize) @@ -1254,9 +1255,12 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, np = xdrlen / PAGE_SIZE + 1; pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { - __free_page(res.scratch); - return -ENOMEM; + if (!pages) + goto out_free_scratch; + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free_pages; } arg.xattr_pages = pages; @@ -1271,14 +1275,15 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, *eofp = res.eof; } +out_free_pages: while (--np >= 0) { if (pages[np]) __free_page(pages[np]); } - - __free_page(res.scratch); kfree(pages); - +out_free_scratch: + __free_page(res.scratch); +out: return ret; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6e060a88f98c..8432bd6b95f0 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1528,7 +1528,6 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, hdr.replen); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; encode_nops(&hdr); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9d354de613da..57b3821d975a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -377,10 +377,10 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out_stateowner; set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags); - set_bit(NFS_OPEN_STATE, &ctx->state->flags); memcpy(&ctx->state->open_stateid.other, &stateid->other, NFS4_STATEID_OTHER_SIZE); update_open_stateid(ctx->state, stateid, NULL, filep->f_mode); + set_bit(NFS_OPEN_STATE, &ctx->state->flags); nfs_file_set_open_context(filep, ctx); put_nfs_open_context(ctx); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 
9e0ca9b2b210..e89468678ae1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5309,7 +5309,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) nfs4_read_done_cb(task, hdr); } -#ifdef CONFIG_NFS_V4_2 +#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) { if (server->caps & NFS_CAP_READ_PLUS) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6985cacf4700..78c9c4bdef2b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -31,13 +31,29 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +static struct nfs_pgio_mirror * +nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_get_mirror) + return desc->pg_ops->pg_get_mirror(desc, idx); + return &desc->pg_mirrors[0]; +} + struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { - return &desc->pg_mirrors[desc->pg_mirror_idx]; + return nfs_pgio_get_mirror(desc, desc->pg_mirror_idx); } EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); +static u32 +nfs_pgio_set_current_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_set_mirror) + return desc->pg_ops->pg_set_mirror(desc, idx); + return desc->pg_mirror_idx; +} + void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) @@ -1259,7 +1275,7 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) return; for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; + mirror = nfs_pgio_get_mirror(desc, midx); desc->pg_completion_ops->error_cleanup(&mirror->pg_list, desc->pg_error); } @@ -1293,12 +1309,12 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, goto out_failed; } - desc->pg_mirror_idx = midx; + nfs_pgio_set_current_mirror(desc, midx); if 
(!nfs_pageio_add_request_mirror(desc, dupreq)) goto out_cleanup_subreq; } - desc->pg_mirror_idx = 0; + nfs_pgio_set_current_mirror(desc, 0); if (!nfs_pageio_add_request_mirror(desc, req)) goto out_failed; @@ -1320,10 +1336,12 @@ out_failed: static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) { - struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; - u32 restore_idx = desc->pg_mirror_idx; + struct nfs_pgio_mirror *mirror; + u32 restore_idx; + + restore_idx = nfs_pgio_set_current_mirror(desc, mirror_idx); + mirror = nfs_pgio_current_mirror(desc); - desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); if (desc->pg_error < 0 || !mirror->pg_recoalesce) @@ -1331,7 +1349,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, if (!nfs_do_recoalesce(desc)) break; } - desc->pg_mirror_idx = restore_idx; + nfs_pgio_set_current_mirror(desc, restore_idx); } /* @@ -1405,7 +1423,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) u32 midx; for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; + mirror = nfs_pgio_get_mirror(desc, midx); if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); if (index != prev->wb_index + 1) { diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index a960ec3a569a..8d3ad5ef2925 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -178,6 +178,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; + bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; @@ -193,7 +194,8 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, return 0; parent = NULL; - if (!parent_watched && !fsnotify_event_needs_parent(inode, mnt, mask)) 
+ parent_needed = fsnotify_event_needs_parent(inode, mnt, mask); + if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ @@ -205,17 +207,17 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, /* * Include parent/name in notification either if some notification - * groups require parent info (!parent_watched case) or the parent is - * interested in this event. + * groups require parent info or the parent is interested in this event. */ - if (!parent_watched || (mask & p_mask & ALL_FSNOTIFY_EVENTS)) { + parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; + if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; - if (parent_watched) + if (parent_interested) mask |= FS_EVENT_ON_CHILD; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..cc71ce3466dc 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,6 +16,13 @@ static const char *proc_self_get_link(struct dentry *dentry, pid_t tgid = task_tgid_nr_ns(current, ns); char *name; + /* + * Not currently supported. Once we can inherit all of struct pid, + * we can allow this. 
+ */ + if (current->flags & PF_KTHREAD) + return ERR_PTR(-EOPNOTSUPP); + if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 217aa2705d5d..ee5a235b3056 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1599,11 +1599,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, src = *ppos; svpfn = src / PM_ENTRY_BYTES; - start_vaddr = svpfn << PAGE_SHIFT; end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > mm->task_size >> PAGE_SHIFT) + start_vaddr = end_vaddr; + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) + start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + + /* Ensure the address is inside the task */ + if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; /* diff --git a/fs/seq_file.c b/fs/seq_file.c index 3b20e21604e7..03a369ccd28c 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -168,12 +168,14 @@ EXPORT_SYMBOL(seq_read); ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct seq_file *m = iocb->ki_filp->private_data; - size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; int err = 0; + if (!iov_iter_count(iter)) + return 0; + mutex_lock(&m->lock); /* @@ -206,36 +208,34 @@ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!m->buf) goto Enomem; } - /* if not empty - flush it first */ + // something left in the buffer - copy it out first if (m->count) { - n = min(m->count, size); - if (copy_to_iter(m->buf + m->from, n, iter) != n) - goto Efault; + n = copy_to_iter(m->buf + m->from, m->count, iter); m->count -= n; m->from += n; - size -= n; copied += n; - if (!size) + if (m->count) // hadn't managed to copy everything goto Done; } - /* we need at least one record in buffer */ + // get a non-empty record in the buffer m->from = 0; p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); - if (!p || IS_ERR(p)) + if (!p || IS_ERR(p)) // EOF or an error break; err = 
m->op->show(m, p); - if (err < 0) + if (err < 0) // hard error break; - if (unlikely(err)) + if (unlikely(err)) // ->show() says "skip it" m->count = 0; - if (unlikely(!m->count)) { + if (unlikely(!m->count)) { // empty record p = m->op->next(m, p, &m->index); continue; } - if (m->count < m->size) + if (!seq_has_overflowed(m)) // got it goto Fill; + // need a bigger buffer m->op->stop(m, p); kvfree(m->buf); m->count = 0; @@ -244,11 +244,14 @@ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto Enomem; p = m->op->start(m, &m->index); } + // EOF or an error m->op->stop(m, p); m->count = 0; goto Done; Fill: - /* they want more? let's try to get some more */ + // one non-empty record is in the buffer; if they want more, + // try to fit more in, but in any case we need to advance + // the iterator once for every record shown. while (1) { size_t offs = m->count; loff_t pos = m->index; @@ -259,30 +262,27 @@ Fill: m->op->next); m->index++; } - if (!p || IS_ERR(p)) { - err = PTR_ERR(p); + if (!p || IS_ERR(p)) // no next record for us break; - } - if (m->count >= size) + if (m->count >= iov_iter_count(iter)) break; err = m->op->show(m, p); - if (seq_has_overflowed(m) || err) { + if (err > 0) { // ->show() says "skip it" m->count = offs; - if (likely(err <= 0)) - break; + } else if (err || seq_has_overflowed(m)) { + m->count = offs; + break; } } m->op->stop(m, p); - n = min(m->count, size); - if (copy_to_iter(m->buf, n, iter) != n) - goto Efault; + n = copy_to_iter(m->buf, m->count, iter); copied += n; m->count -= n; m->from = n; Done: - if (!copied) - copied = err; - else { + if (unlikely(!copied)) { + copied = m->count ? 
-EFAULT : err; + } else { iocb->ki_pos += copied; m->read_pos += copied; } @@ -291,9 +291,6 @@ Done: Enomem: err = -ENOMEM; goto Done; -Efault: - err = -EFAULT; - goto Done; } EXPORT_SYMBOL(seq_read_iter); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index bb128db220ac..d6ef69ab1c67 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -515,7 +515,7 @@ xfs_attr_copy_value( *========================================================================*/ /* - * Query whether the requested number of additional bytes of extended + * Query whether the total requested number of attr fork bytes of extended * attribute space will be able to fit inline. * * Returns zero if not, else the di_forkoff fork offset to be used in the @@ -535,6 +535,12 @@ xfs_attr_shortform_bytesfit( int maxforkoff; int offset; + /* + * Check if the new size could fit at all first: + */ + if (bytes > XFS_LITINO(mp)) + return 0; + /* rounded down */ offset = (XFS_LITINO(mp) - bytes) >> 3; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 577a66381327..beb81c84a937 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -243,8 +243,8 @@ xfs_rmapbt_key_diff( else if (y > x) return -1; - x = be64_to_cpu(kp->rm_offset); - y = xfs_rmap_irec_offset_pack(rec); + x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); + y = rec->rm_offset; if (x > y) return 1; else if (y > x) @@ -275,8 +275,8 @@ xfs_rmapbt_diff_two_keys( else if (y > x) return -1; - x = be64_to_cpu(kp1->rm_offset); - y = be64_to_cpu(kp2->rm_offset); + x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); + y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); if (x > y) return 1; else if (y > x) @@ -390,8 +390,8 @@ xfs_rmapbt_keys_inorder( return 1; else if (a > b) return 0; - a = be64_to_cpu(k1->rmap.rm_offset); - b = be64_to_cpu(k2->rmap.rm_offset); + a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); + b = 
XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); if (a <= b) return 1; return 0; @@ -420,8 +420,8 @@ xfs_rmapbt_recs_inorder( return 1; else if (a > b) return 0; - a = be64_to_cpu(r1->rmap.rm_offset); - b = be64_to_cpu(r2->rmap.rm_offset); + a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); + b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); if (a <= b) return 1; return 0; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 412e2ec55e38..fed56d213a3f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -218,13 +218,13 @@ xchk_bmap_xref_rmap( * which doesn't track unwritten state. */ if (owner != XFS_RMAP_OWN_COW && - irec->br_state == XFS_EXT_UNWRITTEN && - !(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) + !!(irec->br_state == XFS_EXT_UNWRITTEN) != + !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - if (info->whichfork == XFS_ATTR_FORK && - !(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) + if (!!(info->whichfork == XFS_ATTR_FORK) != + !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index f52a7b8256f9..debf392e0515 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -452,32 +452,41 @@ xchk_btree_check_minrecs( int level, struct xfs_btree_block *block) { - unsigned int numrecs; - int ok_level; - - numrecs = be16_to_cpu(block->bb_numrecs); + struct xfs_btree_cur *cur = bs->cur; + unsigned int root_level = cur->bc_nlevels - 1; + unsigned int numrecs = be16_to_cpu(block->bb_numrecs); /* More records than minrecs means the block is ok. */ - if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level)) + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) return; /* - * Certain btree blocks /can/ have fewer than minrecs records. 
Any - * level greater than or equal to the level of the highest dedicated - * btree block are allowed to violate this constraint. - * - * For a btree rooted in a block, the btree root can have fewer than - * minrecs records. If the btree is rooted in an inode and does not - * store records in the root, the direct children of the root and the - * root itself can have fewer than minrecs records. + * For btrees rooted in the inode, it's possible that the root block + * contents spilled into a regular ondisk block because there wasn't + * enough space in the inode root. The number of records in that + * child block might be less than the standard minrecs, but that's ok + * provided that there's only one direct child of the root. */ - ok_level = bs->cur->bc_nlevels - 1; - if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) - ok_level--; - if (level >= ok_level) + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 2) { + struct xfs_btree_block *root_block; + struct xfs_buf *root_bp; + int root_maxrecs; + + root_block = xfs_btree_get_block(cur, root_level, &root_bp); + root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level); + if (be16_to_cpu(root_block->bb_numrecs) != 1 || + numrecs <= root_maxrecs) + xchk_btree_set_corrupt(bs->sc, cur, level); return; + } - xchk_btree_set_corrupt(bs->sc, bs->cur, level); + /* + * Otherwise, only the root level is allowed to have fewer than minrecs + * records or keyptrs. + */ + if (level < root_level) + xchk_btree_set_corrupt(bs->sc, cur, level); } /* diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 7c432997edad..b045e95c2ea7 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -558,14 +558,27 @@ xchk_directory_leaf1_bestfree( /* Check all the bestfree entries. 
*/ for (i = 0; i < bestcount; i++, bestp++) { best = be16_to_cpu(*bestp); - if (best == NULLDATAOFF) - continue; error = xfs_dir3_data_read(sc->tp, sc->ip, - i * args->geo->fsbcount, 0, &dbp); + xfs_dir2_db_to_da(args->geo, i), + XFS_DABUF_MAP_HOLE_OK, + &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; - xchk_directory_check_freesp(sc, lblk, dbp, best); + + if (!dbp) { + if (best != NULLDATAOFF) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + lblk); + break; + } + continue; + } + + if (best == NULLDATAOFF) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + else + xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) break; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3abb8b9d6f4c..7b9ff824e82d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -706,6 +706,23 @@ relock: return 0; } +/* + * Check that the imap we are going to return to the caller spans the entire + * range that the caller requested for the IO. + */ +static bool +imap_spans_range( + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + if (imap->br_startoff > offset_fsb) + return false; + if (imap->br_startoff + imap->br_blockcount < end_fsb) + return false; + return true; +} + static int xfs_direct_write_iomap_begin( struct inode *inode, @@ -766,6 +783,18 @@ xfs_direct_write_iomap_begin( if (imap_needs_alloc(inode, flags, &imap, nimaps)) goto allocate_blocks; + /* + * NOWAIT IO needs to span the entire requested IO with a single map so + * that we avoid partial IO failures due to the rest of the IO range not + * covered by this map triggering an EAGAIN condition when it is + * subsequently mapped and aborting the IO. 
+ */ + if ((flags & IOMAP_NOWAIT) && + !imap_spans_range(&imap, offset_fsb, end_fsb)) { + error = -EAGAIN; + goto out_unlock; + } + xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 233dcc8784db..2a45138831e3 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -55,6 +55,9 @@ struct xfs_iwalk_ag { /* Where do we start the traversal? */ xfs_ino_t startino; + /* What was the last inode number we saw when iterating the inobt? */ + xfs_ino_t lastino; + /* Array of inobt records we cache. */ struct xfs_inobt_rec_incore *recs; @@ -301,6 +304,9 @@ xfs_iwalk_ag_start( if (XFS_IS_CORRUPT(mp, *has_more != 1)) return -EFSCORRUPTED; + iwag->lastino = XFS_AGINO_TO_INO(mp, agno, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1); + /* * If the LE lookup yielded an inobt record before the cursor position, * skip it and see if there's another one after it. @@ -347,15 +353,17 @@ xfs_iwalk_run_callbacks( struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; - xfs_agino_t restart; + xfs_agino_t next_agino; int error; + next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; + ASSERT(iwag->nr_recs > 0); /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; - restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; + ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK); error = xfs_iwalk_ag_recs(iwag); if (error) @@ -372,7 +380,7 @@ xfs_iwalk_run_callbacks( if (error) return error; - return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); + return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. 
*/ @@ -396,6 +404,7 @@ xfs_iwalk_ag( while (!error && has_more) { struct xfs_inobt_rec_incore *irec; + xfs_ino_t rec_fsino; cond_resched(); if (xfs_pwork_want_abort(&iwag->pwork)) @@ -407,6 +416,15 @@ xfs_iwalk_ag( if (error || !has_more) break; + /* Make sure that we always move forward. */ + rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); + if (iwag->lastino != NULLFSINO && + XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { + error = -EFSCORRUPTED; + goto out; + } + iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; + /* No allocated inodes in this chunk; skip it. */ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { error = xfs_btree_increment(cur, 0, &has_more); @@ -535,6 +553,7 @@ xfs_iwalk( .trim_start = 1, .skip_empty = 1, .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -623,6 +642,7 @@ xfs_iwalk_threaded( iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); + iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) @@ -696,6 +716,7 @@ xfs_inobt_walk( .startino = startino, .sz_recs = xfs_inobt_walk_prefetch(inobt_records), .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 150ee5cb8645..7110507a2b6b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -194,20 +194,25 @@ xfs_initialize_perag( } pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); - if (!pag) + if (!pag) { + error = -ENOMEM; goto out_unwind_new_pags; + } pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - if (xfs_buf_hash_init(pag)) + + error = xfs_buf_hash_init(pag); + if (error) goto out_free_pag; init_waitqueue_head(&pag->pagb_wait); 
spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; - if (radix_tree_preload(GFP_NOFS)) + error = radix_tree_preload(GFP_NOFS); + if (error) goto out_hash_destroy; spin_lock(&mp->m_perag_lock); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ff5930be096c..bec47f2d074b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -691,21 +691,23 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) bio->bi_opf |= REQ_FUA; ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) { - bio_io_error(bio); - return ret; - } + if (unlikely(ret)) + goto out_release; + size = bio->bi_iter.bi_size; - task_io_account_write(ret); + task_io_account_write(size); if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(bio, iocb); ret = submit_bio_wait(bio); + zonefs_file_write_dio_end_io(iocb, size, ret, 0); + +out_release: + bio_release_pages(bio, false); bio_put(bio); - zonefs_file_write_dio_end_io(iocb, size, ret, 0); if (ret >= 0) { iocb->ki_pos += size; return size; |