diff options
Diffstat (limited to 'fs')
135 files changed, 2095 insertions, 1179 deletions
@@ -80,7 +80,7 @@ struct aio_ring { struct kioctx_table { struct rcu_head rcu; unsigned nr; - struct kioctx __rcu *table[]; + struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 43b2a2851ba3..206812ce544a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* there's now no turning back... the old userspace image is dead, * defunct, deceased, etc. */ + SET_PERSONALITY(exec_params.hdr); if (elf_check_fdpic(&exec_params.hdr)) - set_personality(PER_LINUX_FDPIC); - else - set_personality(PER_LINUX); + current->personality |= PER_LINUX_FDPIC; if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a4cb4b642987..da519c1b6ad0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -682,18 +682,30 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (test_bit(BTRFS_ROOT_DELETING, &root->state)) - btrfs_err(fs_info, - "COW'ing blocks on a fs root that's being dropped"); - - if (trans->transaction != fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, - fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -805,8 +817,22 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int progress_passed = 0; struct btrfs_disk_key disk_key; - WARN_ON(trans->transaction != fs_info->running_transaction); - WARN_ON(trans->transid != fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } parent_nritems = btrfs_header_nritems(parent); blocksize = fs_info->nodesize; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index caf0bbd028d1..90aaedce1548 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -313,7 +313,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, { struct btrfs_delayed_item *item; - item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + item = kmalloc(struct_size(item, data, data_len), GFP_NOFS); if (item) { item->data_len = data_len; item->type = type; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index dc1085b2a397..1da213197f55 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -95,7 +95,7 @@ struct btrfs_delayed_item { bool logged; /* The maximum leaf size is 64K, so u16 is more than enough. */ u16 data_len; - char data[]; + char data[] __counted_by(data_len); }; static inline void btrfs_init_delayed_root( diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6a13cf00218b..9fe4ccca50a0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -103,24 +103,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem - * @src: source block rsv to transfer from * @num_bytes: number of bytes to transfer * - * This transfers up to the num_bytes amount from the src rsv to the + * This transfers up to the num_bytes amount, previously reserved, to the * delayed_refs_rsv. Any extra bytes are returned to the space info. */ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; u64 to_free = 0; - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { u64 delta = delayed_refs_rsv->size - @@ -163,6 +156,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -178,9 +173,38 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); + + /* + * We may have raced with someone else, so check again if we the block + * reserve is still not full and release any excess space. + */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info, + to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b8e14b0ba5f1..fd9bf2b709c0 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -407,7 +407,6 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f356f08b55cb..fc313fce5bbd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1514,15 +1514,14 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); + if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, bytenr, parent, root_objectid); - } else { + else ret = insert_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - } + if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1656,7 +1655,10 @@ again: goto again; } } else { - err = -EIO; + err = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); goto out; } } @@ -1699,12 +1701,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac3fca5a5e41..caccd0376342 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -484,10 +484,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) bvec->bv_offset, bvec->bv_len); btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); - if (error) { - btrfs_page_clear_uptodate(fs_info, page, start, len); + if (error) mapping_set_error(page->mapping, error); - } btrfs_page_clear_writeback(fs_info, page, start, len); } @@ -1456,8 +1454,6 @@ done: if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page, - page_start, PAGE_SIZE); mapping_set_error(page->mapping, ret); } unlock_page(page); @@ -1624,8 +1620,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - if (!uptodate) - btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_clear_writeback(fs_info, page, start, len); bio_offset += len; } @@ -2201,7 +2195,6 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, cur, cur_len, !ret); - btrfs_page_clear_uptodate(fs_info, page, cur, cur_len); mapping_set_error(page->mapping, ret); } btrfs_page_unlock_writer(fs_info, page, cur, cur_len); @@ -4002,8 +3995,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = get_eb_page_index(start); - if (check_eb_range(eb, start, len)) + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. + */ + memset(dstv, 0, len); return; + } offset = get_eb_offset_in_page(eb, start); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ca46a529d56b..361535c71c0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1106,6 +1106,25 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } +static void update_time_for_write(struct inode *inode) +{ + struct timespec64 now, ctime; + + if (IS_NOCMTIME(inode)) + return; + + now = current_time(inode); + if (!timespec64_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + ctime = inode_get_ctime(inode); + if (!timespec64_equal(&ctime, &now)) + inode_set_ctime_to_ts(inode, now); + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { @@ -1137,10 +1156,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - if (!IS_NOCMTIME(inode)) { - inode->i_mtime = inode_set_ctime_current(inode); - inode_inc_iversion(inode); - } + update_time_for_write(inode); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); @@ -1451,8 +1467,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; - /* If the write DIO is within EOF, use a shared lock */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; relock: @@ -1460,6 +1481,13 @@ relock: if (err < 0) return err; + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + err = generic_write_checks(iocb, from); if (err <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f09fbdc43f0f..7814b9d654ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1085,9 +1085,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, btrfs_mark_ordered_io_finished(inode, locked_page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(inode->root->fs_info, - locked_page, page_start, - PAGE_SIZE); mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } @@ -2791,7 +2788,6 @@ out_page: mapping_set_error(page->mapping, ret); btrfs_mark_ordered_io_finished(inode, page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE); clear_page_dirty_for_io(page); } btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); @@ -5769,20 +5765,24 @@ out: static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { - if (dir->index_cnt == (u64)-1) { - int ret; + int ret = 0; + btrfs_inode_lock(dir, 0); + if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) - return ret; + goto out; } } - *index = dir->index_cnt; + /* index_cnt is the index number of next new entry, so decrement it. */ + *index = dir->index_cnt - 1; +out: + btrfs_inode_unlock(dir, 0); - return 0; + return ret; } /* @@ -5817,6 +5817,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file) return 0; } +static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct btrfs_file_private *private = file->private_data; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), + &private->last_index); + if (ret) + return ret; + + return generic_file_llseek(file, offset, whence); +} + struct dir_entry { u64 ino; u64 offset; @@ -10868,7 +10881,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { }; static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, + .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 75ab766fe156..8e7d03bc1b56 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2978,7 +2978,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -4338,7 +4338,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 09bfe68d2ea3..1a093ec0f7e3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2117,7 +2117,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2150,7 +2150,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; static struct file_system_type btrfs_root_fs_type = { @@ -2158,8 +2158,7 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | - FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("btrfs"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0bf42dccb041..c780d3729463 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -631,14 +631,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush); if (ret) goto reserve_fail; if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, - delayed_refs_bytes); + btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes); num_bytes -= delayed_refs_bytes; } + btrfs_block_rsv_add_bytes(rsv, num_bytes, true); if (rsv->space_info->force_alloc) do_chunk_alloc = true; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6b309f8a99a8..93869cda6af9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -219,8 +219,8 @@ do { \ (errno))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1e46b839519..cbb17b542131 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4722,7 +4722,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; int ins_nr = 0; - int start_slot; + int start_slot = 0; int ret; if (!(inode->flags & BTRFS_INODE_PREALLOC)) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index c5ff16f9e9fa..744f4f4d4c68 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; u64 off = (u64)index << PAGE_SHIFT; loff_t merkle_pos = merkle_file_pos(inode); int ret; @@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, return ERR_PTR(-EFBIG); index += merkle_pos >> PAGE_SHIFT; again: - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (page) { - if (PageUptodate(page)) - return page; + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!IS_ERR(folio)) { + if (folio_test_uptodate(folio)) + goto out; - lock_page(page); - /* - * We only insert uptodate pages, so !Uptodate has to be - * an error - */ - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + folio_lock(folio); + /* If it's not uptodate after we have the lock, we got a read error. */ + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } - unlock_page(page); - return page; + folio_unlock(folio); + goto out; } - page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), + 0); + if (!folio) return ERR_PTR(-ENOMEM); + ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS); + if (ret) { + folio_put(folio); + /* Did someone else insert a folio here? */ + if (ret == -EEXIST) + goto again; + return ERR_PTR(ret); + } + /* * Merkle item keys are indexed from byte 0 in the merkle tree. * They have the form: @@ -756,28 +763,19 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - page_address(page), PAGE_SIZE, page); + folio_address(folio), PAGE_SIZE, &folio->page); if (ret < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(ret); } if (ret < PAGE_SIZE) - memzero_page(page, ret, PAGE_SIZE - ret); + folio_zero_segment(folio, ret, PAGE_SIZE); - SetPageUptodate(page); - ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + folio_mark_uptodate(folio); + folio_unlock(folio); - if (!ret) { - /* Inserted and ready for fsverity */ - unlock_page(page); - } else { - put_page(page); - /* Did someone race us into inserting this page? */ - if (ret == -EEXIST) - goto again; - page = ERR_PTR(ret); - } - return page; +out: + return folio_file_page(folio, index); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9621455edebc..b9ef6f54635c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1594,7 +1594,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 search_start; u64 hole_size; u64 max_hole_start; - u64 max_hole_size; + u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; @@ -1602,17 +1602,16 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct extent_buffer *l; search_start = dev_extent_search_start(device); + max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - max_hole_start = search_start; - max_hole_size = 0; - + if (!path) { + ret = -ENOMEM; + goto out; + } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -5110,7 +5109,7 @@ static void init_alloc_chunk_ctl_policy_regular( ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); - ctl->max_stripe_size = ctl->max_chunk_size; + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); diff --git a/fs/buffer.c b/fs/buffer.c index 2379564e5aea..12e9a71c693d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2011,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) } EXPORT_SYMBOL(folio_zero_new_buffers); -static void +static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { @@ -2025,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, * current block, then do not map the buffer and let the caller * handle it. */ - BUG_ON(offset >= iomap->offset + iomap->length); + if (offset >= iomap->offset + iomap->length) + return -EIO; switch (iomap->type) { case IOMAP_HOLE: @@ -2037,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); - break; + return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) @@ -2045,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); - break; + return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions @@ -2057,12 +2058,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || - offset >= i_size_read(inode)) + offset >= i_size_read(inode)) { + /* + * This can happen if truncating the block device races + * with the check in the caller as i_size updates on + * block devices aren't synchronized by i_rwsem for + * block devices. + */ + if (S_ISBLK(inode->i_mode)) + return -EIO; set_buffer_new(bh); + } bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); - break; + return 0; + default: + WARN_ON_ONCE(1); + return -EIO; } } @@ -2103,13 +2116,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - if (get_block) { + if (get_block) err = get_block(inode, block, bh, 1); - if (err) - break; - } else { - iomap_to_bh(inode, block, bh, iomap); - } + else + err = iomap_to_bh(inode, block, bh, iomap); + if (err) + break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index e4d5cd56a80b..5b5112c78462 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -249,11 +249,9 @@ static struct inode *parse_longname(const struct inode *parent, if (!dir) { /* This can happen if we're not mounting cephfs on the root */ dir = ceph_get_inode(parent->i_sb, vino, NULL); - if (!dir) - dir = ERR_PTR(-ENOENT); + if (IS_ERR(dir)) + dout("Can't find inode %s (%s)\n", inode_number, name); } - if (IS_ERR(dir)) - dout("Can't find inode %s (%s)\n", inode_number, name); out: kfree(inode_number); @@ -462,7 +460,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, out: fscrypt_fname_free_buffer(&_tname); out_inode: - if ((dir != fname->dir) && !IS_ERR(dir)) { + if (dir != fname->dir) { if ((dir->i_state & I_NEW)) discard_new_inode(dir); else diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b1da02f5dbe3..b5f8038065d7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2969,7 +2969,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off, src_objlen, flags); /* Abort on short copies or on error */ - if (ret < src_objlen) { + if (ret < (long)src_objlen) { dout("Failed partial copy (%zd)\n", ret); goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 800ab7920513..b79100f720b3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -769,9 +769,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ci->i_truncate_seq = truncate_seq; /* the MDS should have revoked these caps */ - WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR | + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 73091fbe3ea4..dee10d22ada9 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,9 +217,12 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) /* deduped */ + if (!rq->out[no] && rq->fillgaps) { /* deduped */ rq->out[no] = erofs_allocpage(pagepool, GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 44a24d573f1f..3700af9ee173 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -235,7 +235,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(ptr); dis = ptr + erofs_blkoff(sb, *pos); - if (!dif->path) { + if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 38217422f938..dbebd8b3127e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7314,7 +7314,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 969ce991b0b0..c1af01b2c42d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, if (wbc->pages_skipped) { /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. + * Writeback is not making progress due to locked buffers. + * Skip this inode for now. Although having skipped pages + * is odd for clean inodes, it can happen for some + * filesystems so handle that gracefully. */ - redirty_tail_locked(inode, wb); + if (inode->i_state & I_DIRTY_ALL) + redirty_tail_locked(inode, wb); + else + inode_cgwb_move_to_attached(inode, wb); return; } diff --git a/fs/fs_context.c b/fs/fs_context.c index a0ad7a0c4680..98589aae5208 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -192,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, EXPORT_SYMBOL(vfs_parse_fs_string); /** - * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse + * @sep: callback for separating next option * - * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be - * called from the ->monolithic_mount_data() fs_context operation. + * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom + * option separator callback. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ -int generic_parse_monolithic(struct fs_context *fc, void *data) +int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, + char *(*sep)(char **)) { char *options = data, *key; int ret = 0; @@ -214,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) if (ret) return ret; - while ((key = strsep(&options, ",")) != NULL) { + while ((key = sep(&options)) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); @@ -233,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) return ret; } +EXPORT_SYMBOL(vfs_parse_monolithic_sep); + +static char *vfs_parse_comma_sep(char **s) +{ + return strsep(s, ","); +} + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @fc: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep); +} EXPORT_SYMBOL(generic_parse_monolithic); /** diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9cbf8d98489a..4a280be229a6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2010,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { if (!spin_trylock(&gl->gl_lockref.lock)) continue; - if (!gl->gl_lockref.count) { + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d26759a98b10..f41ca89d216b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -567,15 +567,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote) struct super_block *sb = sdp->sd_vfs; if (!remote || - gl->gl_state != LM_ST_SHARED || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || gl->gl_demote_state != LM_ST_UNLOCKED) return; /* * Try to get an active super block reference to prevent racing with - * unmount (see trylock_super()). But note that unmount isn't the only - * place where a write lock on s_umount is taken, and we can fail here - * because of things like remount as well. + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. */ if (down_read_trylock(&sb->s_umount)) { atomic_inc(&sb->s_active); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..1429945215a0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) diff --git a/fs/inode.c b/fs/inode.c index 35fd688168c5..84bc3c76e5cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2102,52 +2102,10 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -/** - * current_mgtime - Return FS time (possibly fine-grained) - * @inode: inode. - * - * Return the current time truncated to the time granularity supported by - * the fs, as suitable for a ctime/mtime change. If the ctime is flagged - * as having been QUERIED, get a fine-grained timestamp. - */ -struct timespec64 current_mgtime(struct inode *inode) -{ - struct timespec64 now, ctime; - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; - long nsec = atomic_long_read(pnsec); - - if (nsec & I_CTIME_QUERIED) { - ktime_get_real_ts64(&now); - return timestamp_truncate(now, inode); - } - - ktime_get_coarse_real_ts64(&now); - now = timestamp_truncate(now, inode); - - /* - * If we've recently fetched a fine-grained timestamp - * then the coarse-grained one may still be earlier than the - * existing ctime. Just keep the existing value if so. - */ - ctime = inode_get_ctime(inode); - if (timespec64_compare(&ctime, &now) > 0) - now = ctime; - - return now; -} -EXPORT_SYMBOL(current_mgtime); - -static struct timespec64 current_ctime(struct inode *inode) -{ - if (is_mgtime(inode)) - return current_mgtime(inode); - return current_time(inode); -} - static int inode_needs_update_time(struct inode *inode) { int sync_it = 0; - struct timespec64 now = current_ctime(inode); + struct timespec64 now = current_time(inode); struct timespec64 ctime; /* First try to exhaust all avenues to not sync */ @@ -2578,43 +2536,9 @@ EXPORT_SYMBOL(current_time); */ struct timespec64 inode_set_ctime_current(struct inode *inode) { - struct timespec64 now; - struct timespec64 ctime; - - ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec); - if (!(ctime.tv_nsec & I_CTIME_QUERIED)) { - now = current_time(inode); + struct timespec64 now = current_time(inode); - /* Just copy it into place if it's not multigrain */ - if (!is_mgtime(inode)) { - inode_set_ctime_to_ts(inode, now); - return now; - } - - /* - * If we've recently updated with a fine-grained timestamp, - * then the coarse-grained one may still be earlier than the - * existing ctime. Just keep the existing value if so. - */ - ctime.tv_sec = inode->__i_ctime.tv_sec; - if (timespec64_compare(&ctime, &now) > 0) - return ctime; - - /* - * Ctime updates are usually protected by the inode_lock, but - * we can still race with someone setting the QUERIED flag. - * Try to swap the new nsec value into place. If it's changed - * in the interim, then just go with a fine-grained timestamp. - */ - if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec, - now.tv_nsec) != ctime.tv_nsec) - goto fine_grained; - inode->__i_ctime.tv_sec = now.tv_sec; - return now; - } -fine_grained: - ktime_get_real_ts64(&now); - inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode)); + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); return now; } EXPORT_SYMBOL(inode_set_ctime_current); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ae8673ce08b1..2bc0aa23fde3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -640,11 +640,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t poff, plen; /* - * If the write completely overlaps the current folio, then + * If the write or zeroing completely overlaps the current folio, then * entire folio will be dirtied so there is no need for * per-block state tracking structures to be attached to this folio. + * For the unshare case, we must read in the ondisk contents because we + * are not changing pagecache contents. */ - if (pos <= folio_pos(folio) && + if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && pos + len >= folio_pos(folio) + folio_size(folio)) return 0; @@ -879,8 +881,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + bytes = iov_iter_count(i); +retry: offset = pos & (chunk - 1); - bytes = min(chunk - offset, iov_iter_count(i)); + bytes = min(chunk - offset, bytes); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) @@ -931,10 +935,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) * halfway through, might be a race with munmap, * might be severe memory pressure. */ - if (copied) - bytes = copied; if (chunk > PAGE_SIZE) chunk /= 2; + if (copied) { + bytes = copied; + goto retry; + } } else { pos += status; written += status; @@ -1047,7 +1053,7 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, /* * Scan the data range passed to us for dirty page cache folios. If we find a - * dirty folio, punch out the preceeding range and update the offset from which + * dirty folio, punch out the preceding range and update the offset from which * the next punch will start from. * * We can punch out storage reservations under clean pages because they either @@ -1261,7 +1267,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); - long status = 0; loff_t written = 0; /* don't bother with blocks that are not shared to start with */ @@ -1272,28 +1277,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; do { - unsigned long offset = offset_in_page(pos); - unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - if (iter->iomap.flags & IOMAP_F_STALE) + if (iomap->flags & IOMAP_F_STALE) break; - status = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(status == 0)) + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) return -EIO; cond_resched(); - pos += status; - written += status; - length -= status; + pos += bytes; + written += bytes; + length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length); + } while (length > 0); return written; } diff --git a/fs/libfs.c b/fs/libfs.c index a4eb12757886..37f2d34ee090 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1903,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ + iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; diff --git a/fs/namei.c b/fs/namei.c index 567ee547492b..94565bd7e73f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -249,7 +249,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + atomic_set(&result->refcnt, 1); audit_getname(result); return result; @@ -261,9 +261,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) + return; - if (--name->refcnt > 0) + if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3404707ddbe7..2cd3ccf4c439 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; + bool folio_started; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; + folio_started = false; for (;;) { loff_t sreq_end; @@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { folio_start_fscache(folio); + folio_started = true; + } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 47d892a1d363..f6c74f424691 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -488,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list, kref_get(&head->wb_kref); } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; @@ -510,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -528,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -549,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -589,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; @@ -601,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, @@ -609,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -662,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -711,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; @@ -755,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -794,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -837,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..ef817a0475ff 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: @@ -2519,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, return i; } -static int -ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { + struct pnfs_layout_hdr *lo; struct nfs4_flexfile_layout *ff_layout; const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; @@ -2532,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return -ENOMEM; spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], - dev_count, - NFS4_FF_OP_LAYOUTSTATS); + lo = NFS_I(args->inode)->layout; + if (lo && pnfs_layout_is_valid(lo)) { + ff_layout = FF_LAYOUT_FROM_HDR(lo); + args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &args->devinfo[0], dev_count, + NFS4_FF_OP_LAYOUTSTATS); + } else + args->num_dev = 0; spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 063e00aff87e..28704f924612 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status == 0) { if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); spin_unlock(&inode->i_lock); } status = nfs_post_op_update_inode_force_wcc(inode, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 27fb25567ce7..11e3a285594c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 794343790ea8..5ee283eb9660 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -8866,8 +8870,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre /* Save the EXCHANGE_ID verifier session trunk tests */ memcpy(clp->cl_confirm.data, argp->verifier.data, sizeof(clp->cl_confirm.data)); - if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS) - set_bit(NFS_CS_DS, &clp->cl_flags); out: trace_nfs4_exchange_id(clp, status); rpc_put_task(task); @@ -10618,7 +10620,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e079987af4a3..9a5d911a7edc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clnt->cl_shutdown) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); @@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 306cba0b9e69..84343aefbbd6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2634,31 +2634,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, return mode == 0; } -static int -pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) { const struct pnfs_layout_range *range = data; + const struct cred *cred; struct pnfs_layout_hdr *lo; struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + restart: rcu_read_lock(); list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { - if (!pnfs_layout_can_be_returned(lo) || + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) continue; - inode = lo->plh_inode; spin_lock(&inode->i_lock); - if (!pnfs_should_return_unused_layout(lo, range)) { + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { spin_unlock(&inode->i_lock); continue; } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } spin_unlock(&inode->i_lock); - inode = pnfs_grab_inode_layout_hdr(lo); - if (!inode) - continue; rcu_read_unlock(); - pnfs_mark_layout_for_return(inode, range); - iput(inode); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); + pnfs_put_layout_hdr(lo); cond_resched(); goto restart; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..9d82d50ce0b1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -785,6 +788,8 @@ static void nfs_inode_add_request(struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); @@ -799,8 +804,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { + atomic_long_dec(&nfsi->nrequests); nfs_release_request(req); - atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } @@ -955,18 +960,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2e40c74d2f72..92c7dde148a4 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4113,6 +4113,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct file *file, unsigned long maxcount) { struct xdr_stream *xdr = resp->xdr; + unsigned int base = xdr->buf->page_len & ~PAGE_MASK; unsigned int starting_len = xdr->buf->len; __be32 zero = xdr_zero; __be32 nfserr; @@ -4121,8 +4122,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, return nfserr_resource; nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, - read->rd_offset, &maxcount, - xdr->buf->page_len & ~PAGE_MASK, + read->rd_offset, &maxcount, base, &read->rd_eof); read->rd_length = maxcount; if (nfserr) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 48fe71d309cb..8beb2730929d 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); - if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ - brelse(bh); + if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */ goto failed; - } } lock_buffer(bh); @@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, failed: unlock_page(bh->b_page); put_page(bh->b_page); + if (unlikely(err)) + brelse(bh); return err; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f69c451018e3..62fe0b679e58 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) } /* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct dentry *dentry) +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + const struct export_operations *nop = dentry->d_sb->s_export_op; + + /* + * We need to make sure that the filesystem supports encoding of + * file handles so user can use name_to_handle_at() to compare fids + * reported with events to the file handle of watched objects. + */ + if (!nop) + return -EOPNOTSUPP; + /* - * We need to make sure that the file system supports at least - * encoding a file handle so user can use name_to_handle_at() to - * compare fid returned with event to the file handle of watched - * objects. However, even the relaxed AT_HANDLE_FID flag requires - * at least empty export_operations for ecoding unique file ids. + * For sb/mount mark, we also need to make sure that the filesystem + * supports decoding file handles, so user has a way to map back the + * reported fids to filesystem objects. */ - if (!dentry->d_sb->s_export_op) + if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry) return -EOPNOTSUPP; return 0; @@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto path_put_and_out; - ret = fanotify_test_fid(path.dentry); + ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index a9d82bbb4729..63f70259edc0 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1106,10 +1106,10 @@ repack: } } - /* + /* * The code below may require additional cluster (to extend attribute list) - * and / or one MFT record - * It is too complex to undo operations if -ENOSPC occurs deep inside + * and / or one MFT record + * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. */ @@ -1736,10 +1736,8 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) { - err = -ENOENT; - goto out; - } + if (!attr_b) + return -ENOENT; attr = attr_b; le = le_b; diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 42631b31adf1..7c01735d1219 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is resident: lsize < record_size (1K or 4K) */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; @@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (err < 0) goto out; - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is nonresident. + * The worst case: + * 1T (2^40) extremely fragmented file. + * cluster = 4K (2^12) => 2^28 fragments + * 2^9 fragments per one record => 2^19 records + * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. + * + * the result is 16M bytes per attribute list. + * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] + */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 107e808e06ea..63f14a0232f6 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd) struct rb_node *node, *next; kfree(wnd->free_bits); + wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); @@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) wnd->bits_last = wbits; wnd->free_bits = - kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); + if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 063a6654199b..ec0566b322d5 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ + if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) + dt_type = DT_LNK; + else + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 962f12ce6c0a..1f7a194983c5 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -745,8 +745,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags) + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 2b85cb10f0be..dad976a68985 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2148,7 +2148,7 @@ out1: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; - if (i == idx) + if (i == idx || !pg) continue; unlock_page(pg); put_page(pg); @@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) continue; + /* Check simple case when parent inode equals current inode. */ + if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + continue; + } + /* ntfs_iget5 may sleep. */ dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 12f28cdf5c83..98ccb6650858 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -2168,8 +2168,10 @@ file_is_valid: if (!page) { page = kmalloc(log->page_size, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto out; + } } /* diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 33afee0f5559..fbfe21dbb425 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -983,18 +983,11 @@ out: if (err) return err; - mark_inode_dirty(&ni->vfs_inode); + mark_inode_dirty_sync(&ni->vfs_inode); /* verify(!ntfs_update_mftmirr()); */ - /* - * If we used wait=1, sync_inode_metadata waits for the io for the - * inode to finish. It hangs when media is removed. - * So wait=0 is sent down to sync_inode_metadata - * and filemap_fdatawrite is used for the data blocks. - */ - err = sync_inode_metadata(&ni->vfs_inode, 0); - if (!err) - err = filemap_fdatawrite(ni->vfs_inode.i_mapping); + /* write mft record on disk. */ + err = _ni_write_inode(&ni->vfs_inode, 1); return err; } @@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; + bool dirty = false; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (!wnd_is_used(wnd, lcn, len)) { - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* mark volume as dirty out of wnd->rw_lock */ + dirty = true; end = lcn + len; len = 0; @@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) out: up_write(&wnd->rw_lock); + if (dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 124c6e822623..cf92b2433f7a 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 total = le32_to_cpu(hdr->total); u16 offs[128]; + if (unlikely(!cmp)) + return NULL; + fill_table: if (end > total) return NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index eb2ed0701495..d6d021e19aaa 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -170,8 +170,8 @@ next_attr: nt2kernel(std5->cr_time, &ni->i_crtime); #endif nt2kernel(std5->a_time, &inode->i_atime); - ctime = inode_get_ctime(inode); nt2kernel(std5->c_time, &ctime); + inode_set_ctime_to_ts(inode, ctime); nt2kernel(std5->m_time, &inode->i_mtime); ni->std_fa = std5->fa; @@ -1660,7 +1660,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ - inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime); + inode->i_atime = inode->i_mtime = + inode_set_ctime_to_ts(inode, ni->i_crtime); dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ad430d50bd79..eedacf94edd8 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_mtime = inode_set_ctime_to_ts(inode, - inode_set_ctime_current(dir)); + dir->i_mtime = inode_set_ctime_to_ts( + inode, inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -373,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (IS_POSIXACL(dir)) { - /* + /* * Load in cache current acl to avoid ni_lock(dir): * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create -> * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 98b76d1b09e7..86aecbb01a92 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -847,7 +847,7 @@ struct OBJECT_ID { // Birth Volume Id is the Object Id of the Volume on. // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; //0x10: - + // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. If the Object Id // is changed for some reason, this field will reflect the diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 629403ede6e5..0e6a2777870c 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -42,9 +42,11 @@ enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 +#define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 +#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. */ @@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index c12ebffc94da..53629b1f65e9 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -189,12 +189,19 @@ out: return err; } +/* + * mi_enum_attr - start/continue attributes enumeration in record. + * + * NOTE: mi->mrec - memory of size sbi->record_size + * here we sure that mi->mrec->total == sbi->record_size (see mi_read) + */ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); - u32 t32, off, asize; + u32 t32, off, asize, prev_type; u16 t16; + u64 data_size, alloc_size, tot_size; if (!attr) { u32 total = le32_to_cpu(rec->total); @@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (!is_rec_inuse(rec)) return NULL; + prev_type = 0; attr = Add2Ptr(rec, off); } else { /* Check if input attr inside record. */ @@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; } - if (off + asize < off) { - /* Overflow check. */ + /* Overflow check. */ + if (off + asize < off) return NULL; - } + prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } @@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); - if ((t32 & 0xf) || (t32 > 0x100)) + if (!t32 || (t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* attributes in record must be ordered by type */ + if (t32 < prev_type) return NULL; /* Check overflow and boundary. */ @@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* Check size of attribute. */ if (!attr->non_res) { + /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) return NULL; t16 = le16_to_cpu(attr->res.data_off); - if (t16 > asize) return NULL; - t32 = le32_to_cpu(attr->res.data_size); - if (t16 + t32 > asize) + if (t16 + le32_to_cpu(attr->res.data_size) > asize) return NULL; t32 = sizeof(short) * attr->name_len; @@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return attr; } - /* Check some nonresident fields. */ - if (attr->name_len && - le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > - le16_to_cpu(attr->nres.run_off)) { + /* Check nonresident fields. */ + if (attr->non_res != 1) + return NULL; + + t16 = le16_to_cpu(attr->nres.run_off); + if (t16 > asize) + return NULL; + + t32 = sizeof(short) * attr->name_len; + if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) + return NULL; + + /* Check start/end vcn. */ + if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) return NULL; - } - if (attr->nres.svcn || !is_attr_ext(attr)) { + data_size = le64_to_cpu(attr->nres.data_size); + if (le64_to_cpu(attr->nres.valid_size) > data_size) + return NULL; + + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (data_size > alloc_size) + return NULL; + + t32 = mi->sbi->cluster_mask; + if (alloc_size & t32) + return NULL; + + if (!attr->nres.svcn && is_attr_ext(attr)) { + /* First segment of sparse/compressed attribute */ + if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + tot_size = le64_to_cpu(attr->nres.total_size); + if (tot_size & t32) + return NULL; + + if (tot_size > alloc_size) + return NULL; + } else { if (asize + 8 < SIZEOF_NONRESIDENT) return NULL; if (attr->nres.c_unit) return NULL; - } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) - return NULL; + } return attr; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index cfec5e0c7f66..f763e3256ccc 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root; * ntfs3.1 * cluster size * number of clusters + * total number of mft records + * number of used mft records ~= number of files + folders + * real state of ntfs "dirty"/"clean" + * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; - seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver, - sbi->volume.minor_ver, sbi->cluster_size, - sbi->used.bitmap.nbits); + seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n", + sbi->volume.major_ver, sbi->volume.minor_ver, + sbi->cluster_size, sbi->used.bitmap.nbits, + sbi->mft.bitmap.nbits, + sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), + sbi->volume.real_dirty ? "dirty" : "clean", + (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } @@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, { int err; struct super_block *sb = pde_data(file_inode(file)); - struct ntfs_sb_info *sbi = sb->s_fs_info; ssize_t ret = count; - u8 *label = kmalloc(count, GFP_NOFS); + u8 *label; + + if (sb_rdonly(sb)) + return -EROFS; + + label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; @@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, while (ret > 0 && label[ret - 1] == '\n') ret -= 1; - err = ntfs_set_label(sbi, label, ret); + err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); @@ -576,20 +588,30 @@ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); - if (sbi->mft.ni) + if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); + sbi->mft.ni = NULL; + } - if (sbi->security.ni) + if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); + sbi->security.ni = NULL; + } - if (sbi->reparse.ni) + if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); + sbi->reparse.ni = NULL; + } - if (sbi->objid.ni) + if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); + sbi->objid.ni = NULL; + } - if (sbi->volume.ni) + if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); + sbi->volume.ni = NULL; + } ntfs_update_mftmirr(sbi, 0); @@ -836,7 +858,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -845,6 +867,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u32 boot_off = 0; const char *hint = "Primary boot"; + /* Save original dev_size. Used with alternative boot. */ + dev_size0 = dev_size; + sbi->volume.blocks = dev_size >> PAGE_SHIFT; bh = ntfs_bread(sb, 0); @@ -853,6 +878,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, check_boot: err = -EINVAL; + + /* Corrupted image; do not read OOB */ + if (bh->b_size - sizeof(*boot) < boot_off) + goto out; + boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { @@ -899,9 +929,17 @@ check_boot: goto out; } - sbi->record_size = record_size = - boot->record_size < 0 ? 1 << (-boot->record_size) : - (u32)boot->record_size << cluster_bits; + if (boot->record_size >= 0) { + record_size = (u32)boot->record_size << cluster_bits; + } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { + record_size = 1u << (-boot->record_size); + } else { + ntfs_err(sb, "%s: invalid record size %d.", hint, + boot->record_size); + goto out; + } + + sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes @@ -918,9 +956,15 @@ check_boot: goto out; } - sbi->index_size = boot->index_size < 0 ? - 1u << (-boot->index_size) : - (u32)boot->index_size << cluster_bits; + if (boot->index_size >= 0) { + sbi->index_size = (u32)boot->index_size << cluster_bits; + } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { + sbi->index_size = 1u << (-boot->index_size); + } else { + ntfs_err(sb, "%s: invalid index size %d.", hint, + boot->index_size); + goto out; + } /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { @@ -1055,17 +1099,17 @@ check_boot: if (bh->b_blocknr && !sb_rdonly(sb)) { /* - * Alternative boot is ok but primary is not ok. - * Do not update primary boot here 'cause it may be faked boot. - * Let ntfs to be mounted and update boot later. - */ + * Alternative boot is ok but primary is not ok. + * Do not update primary boot here 'cause it may be faked boot. + * Let ntfs to be mounted and update boot later. + */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) { + if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); - u64 lbo = dev_size - sizeof(*boot); + u64 lbo = dev_size0 - sizeof(*boot); /* * Try alternative boot (last sector) @@ -1079,6 +1123,7 @@ out: boot_off = lbo & (block_size - 1); hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ goto check_boot; } brelse(bh); @@ -1367,7 +1412,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } bytes = inode->i_size; - sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); + sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; @@ -1521,9 +1566,9 @@ load_root: if (boot2) { /* - * Alternative boot is ok but primary is not ok. - * Volume is recognized as NTFS. Update primary boot. - */ + * Alternative boot is ok but primary is not ok. + * Volume is recognized as NTFS. Update primary boot. + */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) @@ -1562,7 +1607,9 @@ load_root: put_inode_out: iput(inode); out: + ntfs3_put_sbi(sbi); kfree(boot2); + ntfs3_put_sbi(sbi); return err; } @@ -1756,7 +1803,6 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); - #ifdef CONFIG_PROC_FS /* Create "/proc/fs/ntfs3" */ proc_info_root = proc_mkdir("fs/ntfs3", NULL); @@ -1798,7 +1844,6 @@ static void __exit exit_ntfs_fs(void) if (proc_info_root) remove_proc_entry("fs/ntfs3", NULL); #endif - } MODULE_LICENSE("GPL"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 29fd391899e5..4920548192a0 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size = le32_to_cpu(info->size); /* Enumerate all xattrs. */ - for (ret = 0, off = 0; off < size; off += ea_size) { + ret = 0; + for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) { ea = Add2Ptr(ea_all, off); ea_size = unpacked_ea_size(ea); @@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, break; if (buffer) { + /* Check if we can use field ea->name */ + if (off + ea_size > size) + break; + if (ret + ea->name_len + 1 > bytes_per_buffer) { err = -ERANGE; goto out; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d1761ec5866a..ada3fcc9c6d5 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, { struct iattr attr = { .ia_valid = - ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME, .ia_atime = stat->atime, .ia_mtime = stat->mtime, }; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index c8c8588bd98c..26b782c53910 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -188,7 +188,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) - return 0; + return 1; /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 4193633c4c7a..8be4dc050d1e 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -341,7 +341,6 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - real.flags = 0; aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; @@ -391,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + /* + * Overlayfs doesn't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + ifl &= ~IOCB_DIO_CALLER_COMP; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { file_start_write(real.file); @@ -407,7 +412,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - real.flags = 0; aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e9539f98e86a..d82d2a043da2 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -8,6 +8,7 @@ struct ovl_config { char *upperdir; char *workdir; + char **lowerdirs; bool default_permissions; int redirect_mode; int verity_mode; @@ -39,17 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; - char *name; }; -/* - * ovl_free_fs() relies on @mnt being the first member when unmounting - * the private mounts created for each layer. Let's check both the - * offset and type. - */ -static_assert(offsetof(struct ovl_layer, mnt) == 0); -static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); - struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index b9355bb6d75a..f6ff23fd101c 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -157,6 +157,34 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { {} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, ovl_next_opt); +} + static ssize_t ovl_parse_param_split_lowerdirs(char *str) { ssize_t nr_layers = 1, nr_colons = 0; @@ -164,7 +192,8 @@ static ssize_t ovl_parse_param_split_lowerdirs(char *str) for (s = d = str;; s++, d++) { if (*s == '\\') { - s++; + /* keep esc chars in split lowerdir */ + *d++ = *s++; } else if (*s == ':') { bool next_colon = (*(s + 1) == ':'); @@ -239,7 +268,7 @@ static void ovl_unescape(char *s) } } -static int ovl_mount_dir(const char *name, struct path *path) +static int ovl_mount_dir(const char *name, struct path *path, bool upper) { int err = -ENOMEM; char *tmp = kstrdup(name, GFP_KERNEL); @@ -248,7 +277,7 @@ static int ovl_mount_dir(const char *name, struct path *path) ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) { pr_err("filesystem on '%s' not supported as upperdir\n", tmp); path_put_init(path); @@ -269,7 +298,7 @@ static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc, struct path path; char *dup; - err = ovl_mount_dir(name, &path); + err = ovl_mount_dir(name, &path, true); if (err) return err; @@ -321,12 +350,6 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) * Set "/lower1", "/lower2", and "/lower3" as lower layers and * "/data1" and "/data2" as data lower layers. Any existing lower * layers are replaced. - * (2) lowerdir=:/lower4 - * Append "/lower4" to current stack of lower layers. This requires - * that there already is at least one lower layer configured. - * (3) lowerdir=::/lower5 - * Append data "/lower5" as data lower layer. This requires that - * there's at least one regular lower layer present. */ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { @@ -348,49 +371,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; } - if (strncmp(name, "::", 2) == 0) { - /* - * This is a data layer. - * There must be at least one regular lower layer - * specified. - */ - if (ctx->nr == 0) { - pr_err("data lower layers without regular lower layers not allowed"); - return -EINVAL; - } - - /* Skip the leading "::". */ - name += 2; - data_layer = true; - /* - * A data layer is automatically an append as there - * must've been at least one regular lower layer. - */ - append = true; - } else if (*name == ':') { - /* - * This is a regular lower layer. - * If users want to append a layer enforce that they - * have already specified a first layer before. It's - * better to be strict. - */ - if (ctx->nr == 0) { - pr_err("cannot append layer if no previous layer has been specified"); - return -EINVAL; - } - - /* - * Once a sequence of data layers has started regular - * lower layers are forbidden. - */ - if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); - return -EINVAL; - } - - /* Skip the leading ":". */ - name++; - append = true; + if (*name == ':') { + pr_err("cannot append lower layer"); + return -EINVAL; } dup = kstrdup(name, GFP_KERNEL); @@ -472,7 +455,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) l = &ctx->lower[nr]; memset(l, 0, sizeof(*l)); - err = ovl_mount_dir_noesc(dup_iter, &l->path); + err = ovl_mount_dir(dup_iter, &l->path, false); if (err) goto out_put; @@ -682,6 +665,7 @@ static int ovl_reconfigure(struct fs_context *fc) } static const struct fs_context_operations ovl_context_ops = { + .parse_monolithic = ovl_parse_monolithic, .parse_param = ovl_parse_param, .get_tree = ovl_get_tree, .reconfigure = ovl_reconfigure, @@ -752,12 +736,12 @@ void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ - mounts = (struct vfsmount **) ofs->layers; + /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->config.lowerdirs; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); + kfree(ofs->config.lowerdirs[i]); mounts[i] = ofs->layers[i].mnt; - kfree(ofs->layers[i].name); } kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); @@ -765,6 +749,7 @@ void ovl_free_fs(struct ovl_fs *ofs) free_anon_bdev(ofs->fs[i].pseudo_dev); kfree(ofs->fs); + kfree(ofs->config.lowerdirs); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); if (ofs->creator_cred) @@ -949,16 +934,23 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; - const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; - - /* ofs->layers[0] is the upper layer */ - seq_printf(m, ",lowerdir=%s", ofs->layers[1].name); - /* dump regular lower layers */ - for (nr = 2; nr < nr_merged_lower; nr++) - seq_printf(m, ":%s", ofs->layers[nr].name); - /* dump data lower layers */ - for (nr = 0; nr < ofs->numdatalayer; nr++) - seq_printf(m, "::%s", data_layers[nr].name); + + /* + * lowerdirs[] starts from offset 1, then + * >= 0 regular lower layers prefixed with : and + * >= 0 data-only lower layers prefixed with :: + * + * we need to escase comma and space like seq_show_option() does and + * we also need to escape the colon separator from lowerdir paths. + */ + seq_puts(m, ",lowerdir="); + for (nr = 1; nr < ofs->numlayer; nr++) { + if (nr > 1) + seq_putc(m, ':'); + if (nr >= nr_merged_lower) + seq_putc(m, ':'); + seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\"); + } if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); seq_show_option(m, "workdir", ofs->config.workdir); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index def266b5e2a3..3fa2416264a4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { - struct ovl_entry *oe = OVL_E(dentry); - struct ovl_path *lowerstack = ovl_lowerstack(oe); + struct ovl_entry *oe; + struct ovl_path *lowerstack; struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; @@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, if (!inode) return -ECHILD; + oe = OVL_I_E(inode); + lowerstack = ovl_lowerstack(oe); upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -167,6 +169,7 @@ static void ovl_free_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); kfree(oi->redirect); + kfree(oi->oe); mutex_destroy(&oi->lock); kmem_cache_free(ovl_inode_cachep, oi); } @@ -176,7 +179,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - ovl_free_entry(oi->oe); + ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe)); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -569,11 +572,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_layer->idx = 0; upper_layer->fsid = 0; - err = -ENOMEM; - upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL); - if (!upper_layer->name) - goto out; - /* * Inherit SB_NOSEC flag from upperdir. * @@ -1122,7 +1120,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; - layers[ofs->numlayer].name = l->name; + /* Store for printing lowerdir=... in ovl_show_options() */ + ofs->config.lowerdirs[ofs->numlayer] = l->name; l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; @@ -1367,8 +1366,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!layers) goto out_err; + ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); + if (!ofs->config.lowerdirs) { + kfree(layers); + goto out_err; + } ofs->layers = layers; - /* Layer 0 is reserved for upper even if there's no upper */ + /* + * Layer 0 is reserved for upper even if there's no upper. + * For consistency, config.lowerdirs[0] is NULL. + */ ofs->numlayer = 1; sb->s_stack_depth = 0; diff --git a/fs/pipe.c b/fs/pipe.c index 6c1a9b1db907..139190165a1c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } ret += copied; - buf->offset = 0; buf->len = copied; if (!iov_iter_count(from)) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..9a8f32f21ff5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -289,9 +289,7 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a8ac0dd8041e..7cebd397cc26 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -175,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -192,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - mmap_read_unlock(mm); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9e72bfe8bbad..31e897ad5e6a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -233,19 +233,18 @@ static void put_quota_format(struct quota_format_type *fmt) * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * - * When the last reference of a dquot will be dropped, the dquot will be - * added to releasing_dquots. We'd then queue work item which would call + * When the last reference of a dquot is dropped, the dquot is added to + * releasing_dquots. We'll then queue work item which will call * synchronize_srcu() and after that perform the final cleanup of all the - * dquots on the list. Both releasing_dquots and free_dquots use the - * dq_free list_head in the dquot struct. When a dquot is removed from - * releasing_dquots, a reference count is always subtracted, and if - * dq_count == 0 at that point, the dquot will be added to the free_dquots. + * dquots on the list. Each cleaned up dquot is moved to free_dquots list. + * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot + * struct. * - * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, - * and this list is searched whenever we need an available dquot. Dquots are - * removed from the list as soon as they are used again, and - * dqstats.free_dquots gives the number of dquots on the list. When - * dquot is invalidated it's completely released from memory. + * Unused and cleaned up dquots are in the free_dquots list and this list is + * searched whenever we need an available dquot. Dquots are removed from the + * list as soon as they are used again and dqstats.free_dquots gives the number + * of dquots on the list. When dquot is invalidated it's completely released + * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when mark * dirtied, and this list is searched when writing dirty dquots back to @@ -321,6 +320,7 @@ static inline void put_dquot_last(struct dquot *dquot) static inline void put_releasing_dquots(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &releasing_dquots); + set_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void remove_free_dquot(struct dquot *dquot) @@ -328,8 +328,10 @@ static inline void remove_free_dquot(struct dquot *dquot) if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); - if (!atomic_read(&dquot->dq_count)) + if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) dqstats_dec(DQST_FREE_DQUOTS); + else + clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) @@ -581,12 +583,6 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - /* dquot in releasing_dquots, flush and retry */ - if (!list_empty(&dquot->dq_free)) { - spin_unlock(&dq_list_lock); - goto restart; - } - atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* @@ -606,6 +602,15 @@ restart: goto restart; } /* + * The last user already dropped its reference but dquot didn't + * get fully cleaned up yet. Restart the scan which flushes the + * work cleaning up released dquots. + */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + goto restart; + } + /* * Quota now has no users and it has been written on last * dqput() */ @@ -696,6 +701,13 @@ int dquot_writeback_dquots(struct super_block *sb, int type) dq_dirty); WARN_ON(!dquot_active(dquot)); + /* If the dquot is releasing we should not touch it */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + flush_delayed_work("a_release_work); + spin_lock(&dq_list_lock); + continue; + } /* Now we have active dquot from which someone is * holding reference so we can safely just increase @@ -809,18 +821,18 @@ static void quota_release_workfn(struct work_struct *work) /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); + synchronize_srcu(&dquot_srcu); restart: - synchronize_srcu(&dquot_srcu); spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); - /* Dquot got used again? */ - if (atomic_read(&dquot->dq_count) > 1) { - remove_free_dquot(dquot); - atomic_dec(&dquot->dq_count); - continue; - } + WARN_ON_ONCE(atomic_read(&dquot->dq_count)); + /* + * Note that DQ_RELEASING_B protects us from racing with + * invalidate_dquots() calls so we are safe to work with the + * dquot even after we drop dq_list_lock. + */ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ @@ -834,7 +846,6 @@ restart: } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); - atomic_dec(&dquot->dq_count); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); @@ -875,6 +886,7 @@ void dqput(struct dquot *dquot) BUG_ON(!list_empty(&dquot->dq_free)); #endif put_releasing_dquots(dquot); + atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(system_unbound_wq, "a_release_work, 1); } @@ -963,7 +975,7 @@ we_slept: dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is - * already finished or it will be canceled due to dq_count > 1 test */ + * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index b81749492ef9..7d12b8c5b2fa 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2699,7 +2699,7 @@ struct reiserfs_iget_args { #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) #define journal_trans_half(blocksize) \ - ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32)) /* journal.c see journal.c for all the comments here */ @@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc { __le32 j_len; __le32 j_mount_id; /* mount id of this trans */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) @@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc { struct reiserfs_journal_commit { __le32 j_trans_id; /* must match j_trans_id from the desc block */ __le32 j_len; /* ditto */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b17f067e4ada..fe1bf5b6e0cb 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -15,6 +15,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); +static void cfids_laundromat_worker(struct work_struct *work); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, @@ -169,15 +170,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, return -ENOENT; } /* - * At this point we either have a lease already and we can just - * return it. If not we are guaranteed to be the only thread accessing - * this cfid. + * Return cached fid if it has a lease. Otherwise, it is either a new + * entry or laundromat worker removed it from @cfids->entries. Caller + * will put last reference if the latter. */ + spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); return 0; } + spin_unlock(&cfids->cfid_list_lock); /* * Skip any prefix paths in @path as lookup_positive_unlocked() ends up @@ -294,9 +298,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, goto oshr_free; } } + spin_lock(&cfids->cfid_list_lock); cfid->dentry = dentry; cfid->time = jiffies; cfid->has_lease = true; + spin_unlock(&cfids->cfid_list_lock); oshr_free: kfree(utf16_path); @@ -305,24 +311,28 @@ oshr_free: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); spin_lock(&cfids->cfid_list_lock); - if (rc && !cfid->has_lease) { - if (cfid->on_list) { - list_del(&cfid->entry); - cfid->on_list = false; - cfids->num_entries--; + if (!cfid->has_lease) { + if (rc) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } else { + /* + * We are guaranteed to have two references at this + * point. One for the caller and one for a potential + * lease. Release the Lease-ref so that the directory + * will be closed when the caller closes the cached + * handle. + */ + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + goto out; } - rc = -ENOENT; } spin_unlock(&cfids->cfid_list_lock); - if (!rc && !cfid->has_lease) { - /* - * We are guaranteed to have two references at this point. - * One for the caller and one for a potential lease. - * Release the Lease-ref so that the directory will be closed - * when the caller closes the cached handle. - */ - kref_put(&cfid->refcount, smb2_close_cached_fid); - } if (rc) { if (cfid->is_open) SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, @@ -330,7 +340,7 @@ oshr_free: free_cached_dir(cfid); cfid = NULL; } - +out: if (rc == 0) { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); @@ -452,6 +462,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids == NULL) + return; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { list_move(&cfid->entry, &entry); @@ -569,53 +582,51 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(cfid); } -static int -cifs_cfids_laundromat_thread(void *p) +static void cfids_laundromat_worker(struct work_struct *work) { - struct cached_fids *cfids = p; + struct cached_fids *cfids; struct cached_fid *cfid, *q; - struct list_head entry; + LIST_HEAD(entry); - while (!kthread_should_stop()) { - ssleep(1); - INIT_LIST_HEAD(&entry); - if (kthread_should_stop()) - return 0; - spin_lock(&cfids->cfid_list_lock); - list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { - list_del(&cfid->entry); - list_add(&cfid->entry, &entry); - cfids->num_entries--; - } - } - spin_unlock(&cfids->cfid_list_lock); + cfids = container_of(work, struct cached_fids, laundromat_work.work); - list_for_each_entry_safe(cfid, q, &entry, entry) { + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + if (cfid->time && + time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { cfid->on_list = false; - list_del(&cfid->entry); + list_move(&cfid->entry, &entry); + cfids->num_entries--; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + /* + * Cancel and wait for the work to finish in case we are racing + * with it. + */ + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { /* - * Cancel, and wait for the work to finish in - * case we are racing with it. + * Our lease has not yet been cancelled from the server + * so we need to drop the reference. */ - cancel_work_sync(&cfid->lease_break); - if (cfid->has_lease) { - /* - * We lease has not yet been cancelled from - * the server so we need to drop the reference. - */ - spin_lock(&cfids->cfid_list_lock); - cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); - } + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } + /* Drop the extra reference opened above */ + kref_put(&cfid->refcount, smb2_close_cached_fid); } - - return 0; + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); } - struct cached_fids *init_cached_dirs(void) { struct cached_fids *cfids; @@ -626,19 +637,10 @@ struct cached_fids *init_cached_dirs(void) spin_lock_init(&cfids->cfid_list_lock); INIT_LIST_HEAD(&cfids->entries); - /* - * since we're in a cifs function already, we know that - * this will succeed. No need for try_module_get(). - */ - __module_get(THIS_MODULE); - cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread, - cfids, "cifsd-cfid-laundromat"); - if (IS_ERR(cfids->laundromat)) { - cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n"); - kfree(cfids); - module_put(THIS_MODULE); - return NULL; - } + INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker); + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); + return cfids; } @@ -651,11 +653,10 @@ void free_cached_dirs(struct cached_fids *cfids) struct cached_fid *cfid, *q; LIST_HEAD(entry); - if (cfids->laundromat) { - kthread_stop(cfids->laundromat); - cfids->laundromat = NULL; - module_put(THIS_MODULE); - } + if (cfids == NULL) + return; + + cancel_delayed_work_sync(&cfids->laundromat_work); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index a82ff2cea789..81ba0fd5cc16 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -57,7 +57,7 @@ struct cached_fids { spinlock_t cfid_list_lock; int num_entries; struct list_head entries; - struct task_struct *laundromat; + struct delayed_work laundromat_work; }; extern struct cached_fids *init_cached_dirs(void); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 032d8716f671..02082621d8e0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1807,6 +1807,7 @@ static inline bool is_retryable_error(int error) #define MID_RETRY_NEEDED 8 /* session closed while this request out */ #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ @@ -1943,7 +1944,7 @@ require use of the stronger protocol */ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc + * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 7d8035846680..0c37eefa18a5 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -512,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses); extern struct cifs_ses *sesInfoAlloc(void); extern void sesInfoFree(struct cifs_ses *); -extern struct cifs_tcon *tconInfoAlloc(void); +extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled); extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 687754791bf0..7b923e36501b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1882,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) } } - tcon = tconInfoAlloc(); + /* no need to setup directory caching on IPC share, so pass in false */ + tcon = tcon_info_alloc(false); if (tcon == NULL) return -ENOMEM; @@ -2473,8 +2474,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) { - int rc, xid; struct cifs_tcon *tcon; + bool nohandlecache; + int rc, xid; tcon = cifs_find_tcon(ses, ctx); if (tcon) { @@ -2492,11 +2494,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } - tcon = tconInfoAlloc(); + if (ses->server->dialect >= SMB20_PROT_ID && + (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)) + nohandlecache = ctx->nohandlecache; + else + nohandlecache = true; + tcon = tcon_info_alloc(!nohandlecache); if (tcon == NULL) { rc = -ENOMEM; goto out_fail; } + tcon->nohandlecache = nohandlecache; if (ctx->snapshot_time) { if (ses->server->vals->protocol_id == 0) { @@ -2658,10 +2666,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->nocase = ctx->nocase; tcon->broken_sparse_sup = ctx->no_sparse; tcon->max_cached_dirs = ctx->max_cached_dirs; - if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) - tcon->nohandlecache = ctx->nohandlecache; - else - tcon->nohandlecache = true; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -2891,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server) if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; - rc = socket->ops->bind(socket, - (struct sockaddr *) &server->srcaddr, - sizeof(server->srcaddr)); + rc = kernel_bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; @@ -3042,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = socket->ops->connect(socket, saddr, slen, - server->noblockcnt ? O_NONBLOCK : 0); + rc = kernel_connect(socket, saddr, slen, + server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in * connect. Otherwise bail out and then let cifs_reconnect() perform diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e45ce31bbda7..a3493da12ad1 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1541,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); + ctx->password = NULL; return -EINVAL; } diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 366b755ca913..35b176457bbe 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free) } struct cifs_tcon * -tconInfoAlloc(void) +tcon_info_alloc(bool dir_leases_enabled) { struct cifs_tcon *ret_buf; ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfids = init_cached_dirs(); - if (!ret_buf->cfids) { - kfree(ret_buf); - return NULL; + + if (dir_leases_enabled == true) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { + kfree(ret_buf); + return NULL; + } } + /* else ret_buf->cfids is already set to NULL above */ atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index b41e2e872b22..0b89f7008ac0 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -539,6 +539,9 @@ static int parse_create_response(struct cifs_open_info_data *data, int rc = 0; switch (rsp->hdr.Status) { + case STATUS_IO_REPARSE_TAG_NOT_HANDLED: + reparse_point = true; + break; case STATUS_STOPPED_ON_SYMLINK: rc = smb2_parse_symlink_response(cifs_sb, iov, &data->symlink_target); diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 194799ddd382..1a90dd78b238 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = { "STATUS_IO_REPARSE_TAG_MISMATCH"}, {STATUS_IO_REPARSE_DATA_INVALID, -EIO, "STATUS_IO_REPARSE_DATA_INVALID"}, - {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO, - "STATUS_IO_REPARSE_TAG_NOT_HANDLED"}, {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO, "STATUS_REPARSE_POINT_NOT_RESOLVED"}, {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 44d4943e9c56..c75a80bb6d9e 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -848,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) iov[num].iov_base = create_posix_buf(mode); if (mode == ACL_NO_MODE) - cifs_dbg(FYI, "Invalid mode\n"); + cifs_dbg(FYI, "%s: no mode\n", __func__); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -3878,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = tconInfoAlloc(); + tcon = tcon_info_alloc(false); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 2a2aec8c6112..94df9eec3d8d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1401,10 +1401,13 @@ create_conn: server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - if (server->smbd_conn) + if (server->smbd_conn) { cifs_dbg(VFS, "RDMA transport re-established\n"); - - return server->smbd_conn ? 0 : -ENOENT; + trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); + return 0; + } + trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); + return -ENOENT; } static void destroy_caches_and_workqueue(struct smbd_connection *info) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index a7e4755bed0f..de199ec9f726 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \ TP_ARGS(hostname, conn_id, addr)) DEFINE_SMB3_CONNECT_EVENT(connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err); DECLARE_EVENT_CLASS(smb3_connect_err_class, TP_PROTO(char *hostname, __u64 conn_id, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index d52057a511ee..14710afdc2a3 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -35,6 +35,8 @@ void cifs_wake_up_task(struct mid_q_entry *mid) { + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; wake_up_process(mid->callback_data); } @@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount) struct TCP_Server_Info *server = midEntry->server; if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && - midEntry->mid_state == MID_RESPONSE_RECEIVED && + (midEntry->mid_state == MID_RESPONSE_RECEIVED || + midEntry->mid_state == MID_RESPONSE_READY) && server->ops->handle_cancelled_mid) server->ops->handle_cancelled_mid(midEntry, server); @@ -737,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) int error; error = wait_event_state(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED, + midQ->mid_state != MID_REQUEST_SUBMITTED && + midQ->mid_state != MID_RESPONSE_RECEIVED, (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; @@ -890,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) spin_lock(&server->mid_lock); switch (mid->mid_state) { - case MID_RESPONSE_RECEIVED: + case MID_RESPONSE_READY: spin_unlock(&server->mid_lock); return rc; case MID_RETRY_NEEDED: @@ -989,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid) credits.instance = server->reconnect_instance; add_credits(server, &credits, mid->optype); + + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; } static void @@ -1209,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, send_cancel(server, &rqst[i], midQ[i]); spin_lock(&server->mid_lock); midQ[i]->mid_flags |= MID_WAIT_CANCELLED; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || + midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1230,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (!midQ[i]->resp_buf || - midQ[i]->mid_state != MID_RESPONSE_RECEIVED) { + midQ[i]->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_dbg(FYI, "Bad MID state?\n"); goto out; @@ -1417,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc != 0) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1434,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } if (!midQ->resp_buf || !out_buf || - midQ->mid_state != MID_RESPONSE_RECEIVED) { + midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_server_dbg(VFS, "Bad MID state?\n"); goto out; @@ -1558,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* Wait for a reply - allow signals to interrupt. */ rc = wait_event_interruptible(server->response_q, - (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || + (!(midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED)) || ((server->tcpStatus != CifsGood) && (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ spin_lock(&server->srv_lock); if ((rc == -ERESTARTSYS) && - (midQ->mid_state == MID_REQUEST_SUBMITTED) && + (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { spin_unlock(&server->srv_lock); @@ -1596,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1616,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; /* rcvd frame is ok */ - if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_tcon_dbg(VFS, "Bad MID state?\n"); goto out; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 0d990c2f33cd..4b38c3a285f6 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -84,6 +84,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) spin_lock_init(&conn->llist_lock); INIT_LIST_HEAD(&conn->lock_list); + init_rwsem(&conn->session_lock); + down_write(&conn_list_lock); list_add(&conn->conns_list, &conn_list); up_write(&conn_list_lock); @@ -197,6 +199,9 @@ int ksmbd_conn_write(struct ksmbd_work *work) if (work->send_no_response) return 0; + if (!work->iov_idx) + return -EINVAL; + ksmbd_conn_lock(conn); sent = conn->transport->ops->writev(conn->transport, work->iov, work->iov_cnt, diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index ab2583f030ce..3c005246a32e 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -50,6 +50,7 @@ struct ksmbd_conn { struct nls_table *local_nls; struct unicode_map *um; struct list_head conns_list; + struct rw_semaphore session_lock; /* smb session 1 per user */ struct xarray sessions; unsigned long last_active; diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index 408cddf2f094..d2c81a8a11dd 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -73,7 +73,10 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, tree_conn->user = sess->user; tree_conn->share_conf = sc; + tree_conn->t_state = TREE_NEW; status.tree_conn = tree_conn; + atomic_set(&tree_conn->refcount, 1); + init_waitqueue_head(&tree_conn->refcount_q); ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn, GFP_KERNEL)); @@ -93,14 +96,33 @@ out_error: return status; } +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon) +{ + /* + * Checking waitqueue to releasing tree connect on + * tree disconnect. waitqueue_active is safe because it + * uses atomic operation for condition. + */ + if (!atomic_dec_return(&tcon->refcount) && + waitqueue_active(&tcon->refcount_q)) + wake_up(&tcon->refcount_q); +} + int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn) { int ret; + write_lock(&sess->tree_conns_lock); + xa_erase(&sess->tree_conns, tree_conn->id); + write_unlock(&sess->tree_conns_lock); + + if (!atomic_dec_and_test(&tree_conn->refcount)) + wait_event(tree_conn->refcount_q, + atomic_read(&tree_conn->refcount) == 0); + ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id); ksmbd_release_tree_conn_id(sess, tree_conn->id); - xa_erase(&sess->tree_conns, tree_conn->id); ksmbd_share_config_put(tree_conn->share_conf); kfree(tree_conn); return ret; @@ -111,11 +133,15 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, { struct ksmbd_tree_connect *tcon; + read_lock(&sess->tree_conns_lock); tcon = xa_load(&sess->tree_conns, id); if (tcon) { - if (test_bit(TREE_CONN_EXPIRE, &tcon->status)) + if (tcon->t_state != TREE_CONNECTED) + tcon = NULL; + else if (!atomic_inc_not_zero(&tcon->refcount)) tcon = NULL; } + read_unlock(&sess->tree_conns_lock); return tcon; } @@ -129,8 +155,18 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) if (!sess) return -EINVAL; - xa_for_each(&sess->tree_conns, id, tc) + xa_for_each(&sess->tree_conns, id, tc) { + write_lock(&sess->tree_conns_lock); + if (tc->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + ret = -ENOENT; + continue; + } + tc->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + ret |= ksmbd_tree_conn_disconnect(sess, tc); + } xa_destroy(&sess->tree_conns); return ret; } diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 562d647ad9fa..6377a70b811c 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -14,7 +14,11 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; -#define TREE_CONN_EXPIRE 1 +enum { + TREE_NEW = 0, + TREE_CONNECTED, + TREE_DISCONNECTED +}; struct ksmbd_tree_connect { int id; @@ -27,7 +31,9 @@ struct ksmbd_tree_connect { int maximal_access; bool posix_extensions; - unsigned long status; + atomic_t refcount; + wait_queue_head_t refcount_q; + unsigned int t_state; }; struct ksmbd_tree_conn_status { @@ -46,6 +52,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, const char *share_name); +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 8a5dcab05614..15f68ee05089 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -174,7 +174,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; - down_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { if (sess->state != SMB2_SESSION_VALID || time_after(jiffies, @@ -185,7 +185,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) continue; } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -227,7 +227,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } + up_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { unsigned long chann_id; struct channel *chann; @@ -244,7 +246,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) ksmbd_session_destroy(sess); } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, @@ -252,9 +254,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, { struct ksmbd_session *sess; + down_read(&conn->session_lock); sess = xa_load(&conn->sessions, id); if (sess) sess->last_active = jiffies; + up_read(&conn->session_lock); return sess; } @@ -351,6 +355,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->ksmbd_chann_list); xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; + rwlock_init(&sess->tree_conns_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f99d475b28db..63cb08fffde8 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -60,6 +60,7 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; + rwlock_t tree_conns_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 5ab2f52f9b35..3079e607c5fe 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn, if (check_conn_state(work)) return SERVER_HANDLER_CONTINUE; - if (ksmbd_verify_smb_message(work)) + if (ksmbd_verify_smb_message(work)) { + conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); return SERVER_HANDLER_ABORT; + } command = conn->ops->get_cmd_val(work); *cmd = command; @@ -239,6 +241,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: + if (work->tcon) + ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index e881df1d10cb..23bd3d1209df 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) validate_credit: if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(work->conn, hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + smb2_validate_credit_charge(work->conn, hdr)) return 1; - } return 0; } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 544022dd6d20..93262ca3f58a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -231,11 +231,12 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err) { struct smb2_hdr *rsp_hdr; - if (work->next_smb2_rcv_hdr_off) - rsp_hdr = ksmbd_resp_buf_next(work); - else - rsp_hdr = smb2_get_msg(work->response_buf); + rsp_hdr = smb2_get_msg(work->response_buf); rsp_hdr->Status = err; + + work->iov_idx = 0; + work->iov_cnt = 0; + work->next_smb2_rcv_hdr_off = 0; smb2_set_err_rsp(work); } @@ -1993,6 +1994,9 @@ int smb2_tree_connect(struct ksmbd_work *work) if (conn->posix_ext_supported) status.tree_conn->posix_extensions = true; + write_lock(&sess->tree_conns_lock); + status.tree_conn->t_state = TREE_CONNECTED; + write_unlock(&sess->tree_conns_lock); rsp->StructureSize = cpu_to_le16(16); out_err1: rsp->Capabilities = 0; @@ -2122,27 +2126,50 @@ int smb2_tree_disconnect(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); + if (!tcon) { + ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + err = -ENOENT; + goto err_out; + } + + ksmbd_close_tree_conn_fds(work); + + write_lock(&sess->tree_conns_lock); + if (tcon->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + err = -ENOENT; + goto err_out; + } + + WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount)); + tcon->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + + err = ksmbd_tree_conn_disconnect(sess, tcon); + if (err) { + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + goto err_out; + } + + work->tcon = NULL; + rsp->StructureSize = cpu_to_le16(4); err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_disconnect_rsp)); if (err) { rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; - smb2_set_err_rsp(work); - return err; + goto err_out; } - if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { - ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + return 0; - rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; - smb2_set_err_rsp(work); - return -ENOENT; - } +err_out: + smb2_set_err_rsp(work); + return err; - ksmbd_close_tree_conn_fds(work); - ksmbd_tree_conn_disconnect(sess, tcon); - work->tcon = NULL; - return 0; } /** @@ -2164,17 +2191,17 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - sess_id = le64_to_cpu(req->hdr.SessionId); - - rsp->StructureSize = cpu_to_le16(4); - err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); - if (err) { - rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + ksmbd_conn_lock(conn); + if (!ksmbd_conn_good(conn)) { + ksmbd_conn_unlock(conn); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return err; + return -ENOENT; } - + sess_id = le64_to_cpu(req->hdr.SessionId); ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); + ksmbd_conn_unlock(conn); + ksmbd_close_session_fds(work); ksmbd_conn_wait_idle(conn, sess_id); @@ -2196,6 +2223,14 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_free_user(sess->user); sess->user = NULL; ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); + + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } return 0; } @@ -3370,8 +3405,10 @@ err_out: } ksmbd_revert_fsids(work); err_out1: - if (!rc) + if (!rc) { + ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED); rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); + } if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -6115,12 +6152,12 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz); nbytes = rpc_resp->payload_sz; - kvfree(rpc_resp); err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); if (err) goto out; + kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer)); @@ -7028,10 +7065,6 @@ skip: ksmbd_debug(SMB, "would have to wait for getting lock\n"); - spin_lock(&work->conn->llist_lock); - list_add_tail(&smb_lock->clist, - &work->conn->lock_list); - spin_unlock(&work->conn->llist_lock); list_add(&smb_lock->llist, &rollback_list); argv = kmalloc(sizeof(void *), GFP_KERNEL); @@ -7062,9 +7095,6 @@ skip: if (work->state != KSMBD_WORK_ACTIVE) { list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); locks_free_lock(flock); if (work->state == KSMBD_WORK_CANCELLED) { @@ -7084,19 +7114,16 @@ skip: } list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); release_async_work(work); goto retry; } else if (!rc) { + list_add(&smb_lock->llist, &rollback_list); spin_lock(&work->conn->llist_lock); list_add_tail(&smb_lock->clist, &work->conn->lock_list); list_add_tail(&smb_lock->flist, &fp->lock_list); spin_unlock(&work->conn->llist_lock); - list_add(&smb_lock->llist, &rollback_list); ksmbd_debug(SMB, "successful in taking lock\n"); } else { goto out; @@ -8036,10 +8063,10 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) goto err_out; } - opinfo_put(opinfo); - ksmbd_fd_put(work, fp); opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); + opinfo_put(opinfo); + ksmbd_fd_put(work, fp); rsp->StructureSize = cpu_to_le16(24); rsp->OplockLevel = rsp_oplevel; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index f41f8d6108ce..c91eac6514dd 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -106,7 +106,7 @@ int ksmbd_query_inode_status(struct inode *inode) ci = __ksmbd_inode_lookup(inode); if (ci) { ret = KSMBD_INODE_STATUS_OK; - if (ci->m_flags & S_DEL_PENDING) + if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)) ret = KSMBD_INODE_STATUS_PENDING_DELETE; atomic_dec(&ci->m_count); } @@ -116,7 +116,7 @@ int ksmbd_query_inode_status(struct inode *inode) bool ksmbd_inode_pending_delete(struct ksmbd_file *fp) { - return (fp->f_ci->m_flags & S_DEL_PENDING); + return (fp->f_ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)); } void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp) @@ -333,6 +333,9 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp) { + if (fp->f_state != FP_INITED) + return NULL; + if (!atomic_inc_not_zero(&fp->refcount)) return NULL; return fp; @@ -382,15 +385,20 @@ int ksmbd_close_fd(struct ksmbd_work *work, u64 id) return 0; ft = &work->sess->file_table; - read_lock(&ft->lock); + write_lock(&ft->lock); fp = idr_find(ft->idr, id); if (fp) { set_close_state_blocked_works(fp); - if (!atomic_dec_and_test(&fp->refcount)) + if (fp->f_state != FP_INITED) fp = NULL; + else { + fp->f_state = FP_CLOSED; + if (!atomic_dec_and_test(&fp->refcount)) + fp = NULL; + } } - read_unlock(&ft->lock); + write_unlock(&ft->lock); if (!fp) return -EINVAL; @@ -570,6 +578,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) fp->tcon = work->tcon; fp->volatile_id = KSMBD_NO_FID; fp->persistent_id = KSMBD_NO_FID; + fp->f_state = FP_NEW; fp->f_ci = ksmbd_inode_get(fp); if (!fp->f_ci) { @@ -591,6 +600,17 @@ err_out: return ERR_PTR(ret); } +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state) +{ + if (!fp) + return; + + write_lock(&ft->lock); + fp->f_state = state; + write_unlock(&ft->lock); +} + static int __close_file_table_ids(struct ksmbd_file_table *ft, struct ksmbd_tree_connect *tcon, diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index fcb13413fa8d..03d0bf941216 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -60,6 +60,12 @@ struct ksmbd_inode { __le32 m_fattr; }; +enum { + FP_NEW = 0, + FP_INITED, + FP_CLOSED +}; + struct ksmbd_file { struct file *filp; u64 persistent_id; @@ -98,6 +104,7 @@ struct ksmbd_file { /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; + unsigned int f_state; }; static inline void set_ctx_actor(struct dir_context *ctx, @@ -142,6 +149,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); void ksmbd_set_fd_limit(unsigned long limit); +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state); /* * INODE hash diff --git a/fs/stat.c b/fs/stat.c index 6e60389d6a15..d43a5cc1bfa4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -27,37 +27,6 @@ #include "mount.h" /** - * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED - * @stat: where to store the resulting values - * @request_mask: STATX_* values requested - * @inode: inode from which to grab the c/mtime - * - * Given @inode, grab the ctime and mtime out if it and store the result - * in @stat. When fetching the value, flag it as queried so the next write - * will use a fine-grained timestamp. - */ -void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) -{ - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; - - /* If neither time was requested, then don't report them */ - if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { - stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); - return; - } - - stat->mtime = inode->i_mtime; - stat->ctime.tv_sec = inode->__i_ctime.tv_sec; - /* - * Atomically set the QUERIED flag and fetch the new value with - * the flag masked off. - */ - stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) & - ~I_CTIME_QUERIED; -} -EXPORT_SYMBOL(fill_mg_cmtime); - -/** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask @@ -89,14 +58,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; - - if (is_mgtime(inode)) { - fill_mg_cmtime(stat, request_mask, inode); - } else { - stat->mtime = inode->i_mtime; - stat->ctime = inode_get_ctime(inode); - } - + stat->mtime = inode->i_mtime; + stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 9f64e7332796..8c8d64e76103 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -70,6 +70,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); static int eventfs_release(struct inode *inode, struct file *file); static const struct inode_operations eventfs_root_dir_inode_operations = { @@ -79,7 +80,7 @@ static const struct inode_operations eventfs_root_dir_inode_operations = { static const struct file_operations eventfs_file_operations = { .open = dcache_dir_open_wrapper, .read = generic_read_dir, - .iterate_shared = dcache_readdir, + .iterate_shared = dcache_readdir_wrapper, .llseek = generic_file_llseek, .release = eventfs_release, }; @@ -396,6 +397,11 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, return ret; } +struct dentry_list { + void *cursor; + struct dentry **dentries; +}; + /** * eventfs_release - called to release eventfs file/dir * @inode: inode to be released @@ -404,26 +410,25 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, static int eventfs_release(struct inode *inode, struct file *file) { struct tracefs_inode *ti; - struct eventfs_inode *ei; - struct eventfs_file *ef; - struct dentry *dentry; - int idx; + struct dentry_list *dlist = file->private_data; + void *cursor; + int i; ti = get_tracefs(inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; - ei = ti->private; - idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, - srcu_read_lock_held(&eventfs_srcu)) { - mutex_lock(&eventfs_mutex); - dentry = ef->dentry; - mutex_unlock(&eventfs_mutex); - if (dentry) - dput(dentry); + if (WARN_ON_ONCE(!dlist)) + return -EINVAL; + + for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { + dput(dlist->dentries[i]); } - srcu_read_unlock(&eventfs_srcu, idx); + + cursor = dlist->cursor; + kfree(dlist->dentries); + kfree(dlist); + file->private_data = cursor; return dcache_dir_close(inode, file); } @@ -442,22 +447,70 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) struct tracefs_inode *ti; struct eventfs_inode *ei; struct eventfs_file *ef; + struct dentry_list *dlist; + struct dentry **dentries = NULL; struct dentry *dentry = file_dentry(file); + struct dentry *d; struct inode *f_inode = file_inode(file); + int cnt = 0; int idx; + int ret; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; + if (WARN_ON_ONCE(file->private_data)) + return -EINVAL; + + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); + if (!dlist) + return -ENOMEM; + ei = ti->private; idx = srcu_read_lock(&eventfs_srcu); list_for_each_entry_srcu(ef, &ei->e_top_files, list, srcu_read_lock_held(&eventfs_srcu)) { - create_dentry(ef, dentry, false); + d = create_dentry(ef, dentry, false); + if (d) { + struct dentry **tmp; + + tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + if (!tmp) + break; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + cnt++; + dentries = tmp; + } } srcu_read_unlock(&eventfs_srcu, idx); - return dcache_dir_open(inode, file); + ret = dcache_dir_open(inode, file); + + /* + * dcache_dir_open() sets file->private_data to a dentry cursor. + * Need to save that but also save all the dentries that were + * opened by this function. + */ + dlist->cursor = file->private_data; + dlist->dentries = dentries; + file->private_data = dlist; + return ret; +} + +/* + * This just sets the file->private_data back to the cursor and back. + */ +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) +{ + struct dentry_list *dlist = file->private_data; + int ret; + + file->private_data = dlist->cursor; + ret = dcache_readdir(file, ctx); + dlist->cursor = file->private_data; + file->private_data = dlist; + return ret; } /** diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index c9d653168ad0..ed0bc8cbc703 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select FS_DEBUG + select XFS_DEBUG help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e9cc481b4ddf..f9f4d694640d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -1001,6 +1001,12 @@ xfs_ag_shrink_space( error = -ENOSPC; goto resv_init_out; } + + /* Update perag geometry */ + pag->block_count -= delta; + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 2420865f3007..a5100a11faf9 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log); #define xlog_check_buf_cancel_table(log) do { } while (0) #endif +/* + * Transform a regular reservation into one suitable for recovery of a log + * intent item. + * + * Intent recovery only runs a single step of the transaction chain and defers + * the rest to a separate transaction. Therefore, we reduce logcount to 1 here + * to avoid livelocks if the log grant space is nearly exhausted due to the + * recovered intent pinning the tail. Keep the same logflags to avoid tripping + * asserts elsewhere. Struct copies abound below. + */ +static inline struct xfs_trans_res +xlog_recover_resv(const struct xfs_trans_res *r) +{ + struct xfs_trans_res ret = { + .tr_logres = r->tr_logres, + .tr_logcount = 1, + .tr_logflags = r->tr_logflags, + }; + + return ret; +} + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5e174685a77c..6264daaab37b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -266,7 +266,8 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + if (!xfs_is_readonly(mp) && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index ad22656376d3..6b2296ff248a 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* If the mtime changes, then ctime must also change */ - ASSERT(flags & XFS_ICHGTIME_CHG); + tv = current_time(inode); - tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; + if (flags & XFS_ICHGTIME_CHG) + inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7d3aa14d81b5..4849efcaa33a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -588,6 +588,8 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); @@ -595,8 +597,6 @@ out: sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; error = 0; } - if (error != -ENOENT) - xchk_stats_merge(mp, sm, &run); return error; need_drain: error = xchk_teardown(sc, 0); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index aeb92624176b..cd91db4a5548 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -185,7 +185,10 @@ xchk_stats_merge_one( { struct xchk_scrub_stats *css; - ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) { + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + return; + } css = &cs->cs_stats[sm->sm_type]; spin_lock(&css->css_lock); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d98e8e77c684..090c3ead43fd 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_format.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/scrub.h" diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5db87b34fb6e..89c7a9f4f930 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -333,7 +333,6 @@ xfs_attr_inactive( int error = 0; mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); if (!xfs_inode_has_attr_fork(dp)) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2788a6f2edcd..36fe2abb16e6 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -547,7 +547,7 @@ xfs_attri_item_recover( struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; - struct xfs_trans_res tres; + struct xfs_trans_res resv; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; @@ -618,8 +618,9 @@ xfs_attri_item_recover( goto out; } - xfs_init_attr_trans(args, &tres, &total); - error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + xfs_init_attr_trans(args, &resv, &total); + resv = xlog_recover_resv(&resv); + error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 7551c3ec4ea5..e736a0844c89 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -490,6 +490,7 @@ xfs_bui_item_recover( struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; + struct xfs_trans_res resv; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; @@ -515,7 +516,8 @@ xfs_bui_item_recover( return error; /* Allocate transaction and do the work. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) goto err_rele; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index afc4c78b9eed..d5787991bb5b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2010, 2023 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" @@ -19,21 +19,147 @@ #include "xfs_log.h" #include "xfs_ag.h" -STATIC int -xfs_trim_extents( +/* + * Notes on an efficient, low latency fstrim algorithm + * + * We need to walk the filesystem free space and issue discards on the free + * space that meet the search criteria (size and location). We cannot issue + * discards on extents that might be in use, or are so recently in use they are + * still marked as busy. To serialise against extent state changes whilst we are + * gathering extents to trim, we must hold the AGF lock to lock out other + * allocations and extent free operations that might change extent state. + * + * However, we cannot just hold the AGF for the entire AG free space walk whilst + * we issue discards on each free space that is found. Storage devices can have + * extremely slow discard implementations (e.g. ceph RBD) and so walking a + * couple of million free extents and issuing synchronous discards on each + * extent can take a *long* time. Whilst we are doing this walk, nothing else + * can access the AGF, and we can stall transactions and hence the log whilst + * modifications wait for the AGF lock to be released. This can lead hung tasks + * kicking the hung task timer and rebooting the system. This is bad. + * + * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI + * lock, gathers a range of inode cluster buffers that are allocated, drops the + * AGI lock and then reads all the inode cluster buffers and processes them. It + * loops doing this, using a cursor to keep track of where it is up to in the AG + * for each iteration to restart the INOBT lookup from. + * + * We can't do this exactly with free space - once we drop the AGF lock, the + * state of the free extent is out of our control and we cannot run a discard + * safely on it in this situation. Unless, of course, we've marked the free + * extent as busy and undergoing a discard operation whilst we held the AGF + * locked. + * + * This is exactly how online discard works - free extents are marked busy when + * they are freed, and once the extent free has been committed to the journal, + * the busy extent record is marked as "undergoing discard" and the discard is + * then issued on the free extent. Once the discard completes, the busy extent + * record is removed and the extent is able to be allocated again. + * + * In the context of fstrim, if we find a free extent we need to discard, we + * don't have to discard it immediately. All we need to do it record that free + * extent as being busy and under discard, and all the allocation routines will + * now avoid trying to allocate it. Hence if we mark the extent as busy under + * the AGF lock, we can safely discard it without holding the AGF lock because + * nothing will attempt to allocate that free space until the discard completes. + * + * This also allows us to issue discards asynchronously like we do with online + * discard, and so for fast devices fstrim will run much faster as we can have + * multiple discard operations in flight at once, as well as pipeline the free + * extent search so that it overlaps in flight discard IO. + */ + +struct workqueue_struct *xfs_discard_wq; + +static void +xfs_discard_endio_work( + struct work_struct *work) +{ + struct xfs_busy_extents *extents = + container_of(work, struct xfs_busy_extents, endio_work); + + xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + kmem_free(extents->owner); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. + */ +static void +xfs_discard_endio( + struct bio *bio) +{ + struct xfs_busy_extents *extents = bio->bi_private; + + INIT_WORK(&extents->endio_work, xfs_discard_endio_work); + queue_work(xfs_discard_wq, &extents->endio_work); + bio_put(bio); +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +int +xfs_discard_extents( + struct xfs_mount *mp, + struct xfs_busy_extents *extents) +{ + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &extents->extent_list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = extents; + bio->bi_end_io = xfs_discard_endio; + submit_bio(bio); + } else { + xfs_discard_endio_work(&extents->endio_work); + } + blk_finish_plug(&plug); + + return error; +} + + +static int +xfs_trim_gather_extents( struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, + struct xfs_alloc_rec_incore *tcur, + struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; - struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - struct xfs_agf *agf; int error; int i; + int batch = 100; /* * Force out the log. This means any transactions that might have freed @@ -45,20 +171,28 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) return error; - agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* - * Look up the longest btree in the AGF and start with it. + * Look up the extent length requested in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); + if (tcur->ar_startblock == NULLAGBLOCK) + error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); + else + error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, + tcur->ar_blockcount, &i); if (error) goto out_del_cursor; + if (i == 0) { + /* nothing of that length left in the AG, we are done */ + tcur->ar_blockcount = 0; + goto out_del_cursor; + } /* * Loop until we are done with all extents that are large - * enough to be worth discarding. + * enough to be worth discarding or we hit batch limits. */ while (i) { xfs_agblock_t fbno; @@ -73,7 +207,16 @@ xfs_trim_extents( error = -EFSCORRUPTED; break; } - ASSERT(flen <= be32_to_cpu(agf->agf_longest)); + + if (--batch <= 0) { + /* + * Update the cursor to point at this extent so we + * restart the next batch from this extent. + */ + tcur->ar_startblock = fbno; + tcur->ar_blockcount = flen; + break; + } /* * use daddr format for all range/len calculations as that is @@ -88,6 +231,7 @@ xfs_trim_extents( */ if (dlen < minlen) { trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + tcur->ar_blockcount = 0; break; } @@ -110,29 +254,103 @@ xfs_trim_extents( goto next_extent; } - trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); - if (error) - break; + xfs_extent_busy_insert_discard(pag, fbno, flen, + &extents->extent_list); *blocks_trimmed += flen; - next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) break; - if (fatal_signal_pending(current)) { - error = -ERESTARTSYS; - break; - } + /* + * If there's no more records in the tree, we are done. Set the + * cursor block count to 0 to indicate to the caller that there + * is no more extents to search. + */ + if (i == 0) + tcur->ar_blockcount = 0; } + /* + * If there was an error, release all the gathered busy extents because + * we aren't going to issue a discard on them any more. + */ + if (error) + xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); return error; } +static bool +xfs_trim_should_stop(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + +/* + * Iterate the free list gathering extents and discarding them. We need a cursor + * for the repeated iteration of gather/discard loop, so use the longest extent + * we found in the last batch as the key to start the next. + */ +static int +xfs_trim_extents( + struct xfs_perag *pag, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_alloc_rec_incore tcur = { + .ar_blockcount = pag->pagf_longest, + .ar_startblock = NULLAGBLOCK, + }; + int error = 0; + + do { + struct xfs_busy_extents *extents; + + extents = kzalloc(sizeof(*extents), GFP_KERNEL); + if (!extents) { + error = -ENOMEM; + break; + } + + extents->mount = pag->pag_mount; + extents->owner = extents; + INIT_LIST_HEAD(&extents->extent_list); + + error = xfs_trim_gather_extents(pag, start, end, minlen, + &tcur, extents, blocks_trimmed); + if (error) { + kfree(extents); + break; + } + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with gathering + * the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(pag->pag_mount, extents); + if (error) + break; + + if (xfs_trim_should_stop()) + break; + + } while (tcur.ar_blockcount != 0); + + return error; + +} + /* * trim a range of the filesystem. * @@ -195,12 +413,12 @@ xfs_ioc_trim( for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); - if (error) { + if (error) last_error = error; - if (error == -ERESTARTSYS) { - xfs_perag_rele(pag); - break; - } + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; } } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index de92d9cc958f..2b1a85223a56 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -3,8 +3,10 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; +struct xfs_mount; +struct xfs_busy_extents; -extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1064c2342876..7cd09c3a82cb 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -146,6 +146,20 @@ xfs_nfs_get_inode( return ERR_PTR(error); } + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return ERR_PTR(error); + } + } + if (VFS_I(ip)->i_generation != generation) { xfs_irele(ip); return ERR_PTR(-ESTALE); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 7c2fdc71e42d..9ecfdcdc752f 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,13 +19,13 @@ #include "xfs_log.h" #include "xfs_ag.h" -void -xfs_extent_busy_insert( - struct xfs_trans *tp, +static void +xfs_extent_busy_insert_list( struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, - unsigned int flags) + unsigned int flags, + struct list_head *busy_list) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; @@ -40,7 +40,7 @@ xfs_extent_busy_insert( new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); + trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; @@ -62,10 +62,33 @@ xfs_extent_busy_insert( rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); - list_add(&new->list, &tp->t_busy); + /* always process discard lists in fifo order */ + list_add_tail(&new->list, busy_list); spin_unlock(&pag->pagb_lock); } +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); +} + +void +xfs_extent_busy_insert_discard( + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + struct list_head *busy_list) +{ + xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + busy_list); +} + /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index c37bf87e6781..0639aab336f3 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -16,9 +16,6 @@ struct xfs_alloc_arg; /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_extent_busy_insert() for details. */ struct xfs_extent_busy { struct rb_node rb_node; /* ag by-bno indexed search tree */ @@ -31,11 +28,32 @@ struct xfs_extent_busy { #define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; +/* + * List used to track groups of related busy extents all the way through + * to discard completion. + */ +struct xfs_busy_extents { + struct xfs_mount *mount; + struct list_head extent_list; + struct work_struct endio_work; + + /* + * Owner is the object containing the struct xfs_busy_extents to free + * once the busy extents have been processed. If only the + * xfs_busy_extents object needs freeing, then point this at itself. + */ + void *owner; +}; + void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void +xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); + +void xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f1a5ecf099aa..3fa8789820ad 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -660,6 +660,7 @@ xfs_efi_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; @@ -683,7 +684,8 @@ xfs_efi_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10403ba9b58f..736e5545f584 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -565,6 +565,19 @@ err: } #endif /* CONFIG_XFS_RT */ +static inline bool +rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) +{ + if (!xfs_has_reflink(mp)) + return true; + if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner)) + return true; + if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return true; + return false; +} + /* Execute a getfsmap query against the regular data device. */ STATIC int __xfs_getfsmap_datadev( @@ -598,7 +611,6 @@ __xfs_getfsmap_datadev( * low to the fsmap low key and max out the high key to the end * of the AG. */ - info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) @@ -608,12 +620,9 @@ __xfs_getfsmap_datadev( /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { - /* empty */ - } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || - (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | - XFS_RMAP_BMBT_BLOCK | - XFS_RMAP_UNWRITTEN))) { - info->low.rm_startblock += info->low.rm_blockcount; + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ info->low.rm_owner = 0; info->low.rm_offset = 0; @@ -621,8 +630,10 @@ __xfs_getfsmap_datadev( if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) return 0; } else { + /* Last record seen was a shareable file data extent */ info->low.rm_offset += info->low.rm_blockcount; } + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e541f5c0bc25..3c210ac83713 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -113,7 +113,7 @@ xfs_inode_alloc( INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return ip; } @@ -443,7 +443,7 @@ xfs_inodegc_queue_all( int cpu; bool ret = false; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); @@ -463,7 +463,7 @@ xfs_inodegc_wait_all( int error = 0; flush_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { struct xfs_inodegc *gc; gc = per_cpu_ptr(mp->m_inodegc, cpu); @@ -1845,9 +1845,17 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + struct xfs_mount *mp = gc->mp; unsigned int nofs_flag; - ASSERT(gc->cpu == smp_processor_id()); + /* + * Clear the cpu mask bit and ensure that we have seen the latest + * update of the gc structure associated with this CPU. This matches + * with the release semantics used when setting the cpumask bit in + * xfs_inodegc_queue. + */ + cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); + smp_mb__after_atomic(); WRITE_ONCE(gc->items, 0); @@ -1862,7 +1870,7 @@ xfs_inodegc_worker( nofs_flag = memalloc_nofs_save(); ip = llist_entry(node, struct xfs_inode, i_gclist); - trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { @@ -2057,6 +2065,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned int cpu_nr; unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); @@ -2064,18 +2073,28 @@ xfs_inodegc_queue( ip->i_flags |= XFS_NEED_INACTIVE; spin_unlock(&ip->i_flags_lock); - gc = get_cpu_ptr(mp->m_inodegc); + cpu_nr = get_cpu(); + gc = this_cpu_ptr(mp->m_inodegc); llist_add(&ip->i_gclist, &gc->list); items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); /* + * Ensure the list add is always seen by anyone who finds the cpumask + * bit set. This effectively gives the cpumask bit set operation + * release ordering semantics. + */ + smp_mb__before_atomic(); + if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) + cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); + + /* * We queue the work while holding the current CPU so that the work * is scheduled to run on this CPU. */ if (!xfs_is_inodegc_enabled(mp)) { - put_cpu_ptr(gc); + put_cpu(); return; } @@ -2085,7 +2104,7 @@ xfs_inodegc_queue( trace_xfs_inodegc_queue(mp, __return_address); mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, queue_delay); - put_cpu_ptr(gc); + put_cpu(); if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); @@ -2094,47 +2113,6 @@ xfs_inodegc_queue( } /* - * Fold the dead CPU inodegc queue into the current CPUs queue. - */ -void -xfs_inodegc_cpu_dead( - struct xfs_mount *mp, - unsigned int dead_cpu) -{ - struct xfs_inodegc *dead_gc, *gc; - struct llist_node *first, *last; - unsigned int count = 0; - - dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_delayed_work_sync(&dead_gc->work); - - if (llist_empty(&dead_gc->list)) - return; - - first = dead_gc->list.first; - last = first; - while (last->next) { - last = last->next; - count++; - } - dead_gc->list.first = NULL; - dead_gc->items = 0; - - /* Add pending work to current CPU */ - gc = get_cpu_ptr(mp->m_inodegc); - llist_add_batch(first, last, &gc->list); - count += READ_ONCE(gc->items); - WRITE_ONCE(gc->items, count); - - if (xfs_is_inodegc_enabled(mp)) { - trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, - 0); - } - put_cpu_ptr(gc); -} - -/* * We set the inode flag atomically with the radix tree tag. Once we get tag * lookups on the radix tree, this inode flag can go away. * @@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count( if (!xfs_is_inodegc_enabled(mp)) return 0; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) return XFS_INODEGC_SHRINKER_COUNT; @@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan( trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { unsigned int h = READ_ONCE(gc->shrinker_hits); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2fa6f2e09d07..905944dafbe5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp); int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); -void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); int xfs_inodegc_register_shrinker(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 360fe83a334f..4d55f58d99b7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1642,8 +1642,11 @@ xfs_inode_needs_inactive( if (VFS_I(ip)->i_mode == 0) return false; - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) return false; /* If the log isn't running, push inodes straight to reclaim. */ @@ -1703,8 +1706,11 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1736,9 +1742,21 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - goto out; + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { + /* + * If this inode is being inactivated during a quotacheck and + * has not yet been scanned by quotacheck, we /must/ remove + * the dquots from the inode before inactivation changes the + * block and inode counts. Most probably this is a result of + * reloading the incore iunlinked list to purge unrecovered + * unlinked inodes. + */ + xfs_qm_dqdetach(ip); + } else { + error = xfs_qm_dqattach(ip); + if (error) + goto out; + } if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); @@ -1822,12 +1840,17 @@ xfs_iunlink_lookup( rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* Caller can handle inode not being in memory. */ + rcu_read_unlock(); + return NULL; + } /* - * Inode not in memory or in RCU freeing limbo should not happen. - * Warn about this and let the caller handle the failure. + * Inode in RCU freeing limbo should not happen. Warn about this and + * let the caller handle the failure. */ - if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + if (WARN_ON_ONCE(!ip->i_ino)) { rcu_read_unlock(); return NULL; } @@ -1836,7 +1859,10 @@ xfs_iunlink_lookup( return ip; } -/* Update the prev pointer of the next agino. */ +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ static int xfs_iunlink_update_backref( struct xfs_perag *pag, @@ -1851,7 +1877,8 @@ xfs_iunlink_update_backref( ip = xfs_iunlink_lookup(pag, next_agino); if (!ip) - return -EFSCORRUPTED; + return -ENOLINK; + ip->i_prev_unlinked = prev_agino; return 0; } @@ -1895,6 +1922,64 @@ xfs_iunlink_update_bucket( return 0; } +/* + * Load the inode @next_agino into the cache and set its prev_unlinked pointer + * to @prev_agino. Caller must hold the AGI to synchronize with other changes + * to the unlinked list. + */ +STATIC int +xfs_iunlink_reload_next( + struct xfs_trans *tp, + struct xfs_buf *agibp, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_perag *pag = agibp->b_pag; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *next_ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(next_agino != NULLAGINO); + +#ifdef DEBUG + rcu_read_lock(); + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); + ASSERT(next_ip == NULL); + rcu_read_unlock(); +#endif + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", + next_agino, pag->pag_agno); + + /* + * Use an untrusted lookup just to be cautious in case the AGI has been + * corrupted and now points at a free inode. That shouldn't happen, + * but we'd rather shut down now since we're already running in a weird + * situation. + */ + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + if (error) + return error; + + /* If this is not an unlinked inode, something is very wrong. */ + if (VFS_I(next_ip)->i_nlink != 0) { + error = -EFSCORRUPTED; + goto rele; + } + + next_ip->i_prev_unlinked = prev_agino; + trace_xfs_iunlink_reload_next(next_ip); +rele: + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + if (xfs_is_quotacheck_running(mp) && next_ip) + xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); + xfs_irele(next_ip); + return error; +} + static int xfs_iunlink_insert_inode( struct xfs_trans *tp, @@ -1926,6 +2011,8 @@ xfs_iunlink_insert_inode( * inode. */ error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); if (error) return error; @@ -1941,6 +2028,7 @@ xfs_iunlink_insert_inode( } /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } @@ -2020,6 +2108,9 @@ xfs_iunlink_remove_inode( */ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); if (error) return error; @@ -2040,7 +2131,7 @@ xfs_iunlink_remove_inode( } ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return error; } @@ -3529,3 +3620,117 @@ xfs_iunlock2_io_mmap( if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } + +/* + * Reload the incore inode list for this inode. Caller should ensure that + * the link count cannot change, either by taking ILOCK_SHARED or otherwise + * preventing other threads from executing. + */ +int +xfs_inode_reload_unlinked_bucket( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agibp; + struct xfs_agi *agi; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t prev_agino, next_agino; + unsigned int bucket; + bool foundit = false; + int error; + + /* Grab the first inode in the list */ + pag = xfs_perag_get(mp, agno); + error = xfs_ialloc_read_agi(pag, tp, &agibp); + xfs_perag_put(pag); + if (error) + return error; + + /* + * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the + * incore unlinked list pointers for this inode. Check once more to + * see if we raced with anyone else to reload the unlinked list. + */ + if (!xfs_inode_unlinked_incomplete(ip)) { + foundit = true; + goto out_agibp; + } + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + agi = agibp->b_addr; + + trace_xfs_inode_reload_unlinked_bucket(ip); + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", + agino, agno); + + prev_agino = NULLAGINO; + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + struct xfs_inode *next_ip = NULL; + + /* Found this caller's inode, set its backlink. */ + if (next_agino == agino) { + next_ip = ip; + next_ip->i_prev_unlinked = prev_agino; + foundit = true; + goto next_inode; + } + + /* Try in-memory lookup first. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (next_ip) + goto next_inode; + + /* Inode not in memory, try reloading it. */ + error = xfs_iunlink_reload_next(tp, agibp, prev_agino, + next_agino); + if (error) + break; + + /* Grab the reloaded inode. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (!next_ip) { + /* No incore inode at all? We reloaded it... */ + ASSERT(next_ip != NULL); + error = -EFSCORRUPTED; + break; + } + +next_inode: + prev_agino = next_agino; + next_agino = next_ip->i_next_unlinked; + } + +out_agibp: + xfs_trans_brelse(tp, agibp); + /* Should have found this inode somewhere in the iunlinked bucket. */ + if (!error && !foundit) + error = -EFSCORRUPTED; + return error; +} + +/* Decide if this inode is missing its unlinked list and reload it. */ +int +xfs_inode_reload_unlinked( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_unlinked_incomplete(ip)) + error = xfs_inode_reload_unlinked_bucket(tp, ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp); + + return error; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7547caf2f2ab..0c5bdb91152e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -68,8 +68,21 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ - /* unlinked list pointers */ + /* + * Unlinked list pointers. These point to the next and previous inodes + * in the AGI unlinked bucket list, respectively. These fields can + * only be updated with the AGI locked. + * + * i_next_unlinked caches di_next_unlinked. + */ xfs_agino_t i_next_unlinked; + + /* + * If the inode is not on an unlinked list, this field is zero. If the + * inode is the first element in an unlinked list, this field is + * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode + * in the unlinked list. + */ xfs_agino_t i_prev_unlinked; /* VFS inode */ @@ -81,6 +94,11 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) +{ + return ip->i_prev_unlinked != 0; +} + static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) { return ip->i_forkoff > 0; @@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) */ #define XFS_INACTIVATING (1 << 13) +/* Quotacheck is running but inode has not been added to quota counts. */ +#define XFS_IQUOTAUNCHECKED (1 << 14) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ - XFS_INACTIVATING) + XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. @@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +static inline bool +xfs_inode_unlinked_incomplete( + struct xfs_inode *ip) +{ + return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); +} +int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_inode_reload_unlinked(struct xfs_inode *ip); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2ededd3f6b8c..2b3b05c28e9e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -573,10 +573,10 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); - fill_mg_cmtime(stat, request_mask, inode); - if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; @@ -584,6 +584,11 @@ xfs_vn_getattr( } } + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->change_cookie = inode_query_iversion(inode); + stat->result_mask |= STATX_CHANGE_COOKIE; + } + /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -917,7 +922,7 @@ xfs_setattr_size( if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_mgtime(inode); + current_time(inode); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c2093cb56092..f5377ba5967a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -80,6 +80,17 @@ xfs_bulkstat_one_int( if (error) goto out; + /* Reload the incore unlinked list to avoid failure in inodegc. */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked_bucket(tp, ip); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return error; + } + } + ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 79004d193e54..51c100c86177 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -715,15 +715,7 @@ xfs_log_mount( * just worked. */ if (!xfs_has_norecovery(mp)) { - /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, - &mp->m_opstate); error = xlog_recover(log); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); @@ -772,7 +764,6 @@ xfs_log_mount_finish( struct xfs_mount *mp) { struct xlog *log = mp->m_log; - bool readonly; int error = 0; if (xfs_has_norecovery(mp)) { @@ -781,12 +772,6 @@ xfs_log_mount_finish( } /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - - /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -835,8 +820,6 @@ xfs_log_mount_finish( xfs_buftarg_drain(mp->m_ddev_targp); clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ ASSERT(!error || xlog_is_shutdown(log)); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index eccbfb99e894..67a99d94701e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -16,8 +16,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" - -struct workqueue_struct *xfs_discard_wq; +#include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); @@ -124,7 +123,7 @@ xlog_cil_push_pcp_aggregate( struct xlog_cil_pcp *cilpcp; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &ctx->cil_pcpmask) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ctx->ticket->t_curr_res += cilpcp->space_reserved; @@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate( if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, - &ctx->busy_extents); + &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); @@ -165,7 +164,13 @@ xlog_cil_insert_pcp_aggregate( if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) return; - for_each_online_cpu(cpu) { + /* + * We can race with other cpus setting cil_pcpmask. However, we've + * atomically cleared PCP_SPACE which forces other threads to add to + * the global space used count. cil_pcpmask is a superset of cilpcp + * structures that could have a nonzero space_used. + */ + for_each_cpu(cpu, &ctx->cil_pcpmask) { int old, prev; cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); @@ -554,6 +559,7 @@ xlog_cil_insert_items( int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; int order; + unsigned int cpu_nr; struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -577,7 +583,12 @@ xlog_cil_insert_items( * can't be scheduled away between split sample/update operations that * are done without outside locking to serialise them. */ - cilpcp = get_cpu_ptr(cil->xc_pcp); + cpu_nr = get_cpu(); + cilpcp = this_cpu_ptr(cil->xc_pcp); + + /* Tell the future push that there was work added by this CPU. */ + if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask)) + cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask); /* * We need to take the CIL checkpoint unit reservation on the first @@ -663,7 +674,7 @@ xlog_cil_insert_items( continue; list_add_tail(&lip->li_cil, &cilpcp->log_items); } - put_cpu_ptr(cilpcp); + put_cpu(); /* * If we've overrun the reservation, dump the tx details before we move @@ -696,76 +707,6 @@ xlog_cil_free_logvec( } } -static void -xlog_discard_endio_work( - struct work_struct *work) -{ - struct xfs_cil_ctx *ctx = - container_of(work, struct xfs_cil_ctx, discard_endio_work); - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - kmem_free(ctx); -} - -/* - * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. Note that we need a unbounded workqueue, otherwise we might - * get the execution delayed up to 30 seconds for weird reasons. - */ -static void -xlog_discard_endio( - struct bio *bio) -{ - struct xfs_cil_ctx *ctx = bio->bi_private; - - INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); - queue_work(xfs_discard_wq, &ctx->discard_endio_work); - bio_put(bio); -} - -static void -xlog_discard_busy_extents( - struct xfs_mount *mp, - struct xfs_cil_ctx *ctx) -{ - struct list_head *list = &ctx->busy_extents; - struct xfs_extent_busy *busyp; - struct bio *bio = NULL; - struct blk_plug plug; - int error = 0; - - ASSERT(xfs_has_discard(mp)); - - blk_start_plug(&plug); - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } - } - - if (bio) { - bio->bi_private = ctx; - bio->bi_end_io = xlog_discard_endio; - submit_bio(bio); - } else { - xlog_discard_endio_work(&ctx->discard_endio_work); - } - blk_finish_plug(&plug); -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -795,8 +736,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); - xfs_extent_busy_sort(&ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents.extent_list); + xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -805,10 +746,14 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) - xlog_discard_busy_extents(mp, ctx); - else - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents.extent_list)) { + ctx->busy_extents.mount = mp; + ctx->busy_extents.owner = ctx; + xfs_discard_extents(mp, &ctx->busy_extents); + return; + } + + kmem_free(ctx); } void @@ -1791,38 +1736,6 @@ out_shutdown: } /* - * Move dead percpu state to the relevant CIL context structures. - * - * We have to lock the CIL context here to ensure that nothing is modifying - * the percpu state, either addition or removal. Both of these are done under - * the CIL context lock, so grabbing that exclusively here will ensure we can - * safely drain the cilpcp for the CPU that is dying. - */ -void -xlog_cil_pcp_dead( - struct xlog *log, - unsigned int cpu) -{ - struct xfs_cil *cil = log->l_cilp; - struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - struct xfs_cil_ctx *ctx; - - down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; - if (ctx->ticket) - ctx->ticket->t_curr_res += cilpcp->space_reserved; - cilpcp->space_reserved = 0; - - if (!list_empty(&cilpcp->log_items)) - list_splice_init(&cilpcp->log_items, &ctx->log_items); - if (!list_empty(&cilpcp->busy_extents)) - list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); - atomic_add(cilpcp->space_used, &ctx->space_used); - cilpcp->space_used = 0; - up_write(&cil->xc_ctx_lock); -} - -/* * Perform initial CIL structure initialisation. */ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1bd2963e8fbd..fa3ad1d7b31c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ +#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ + struct xfs_buf; struct xlog; struct xlog_ticket; @@ -223,14 +225,19 @@ struct xfs_cil_ctx { struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ - struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; + + /* + * CPUs that could have added items to the percpu CIL data. Access is + * coordinated with xc_ctx_lock. + */ + struct cpumask cil_pcpmask; }; /* @@ -278,9 +285,6 @@ struct xfs_cil { wait_queue_head_t xc_push_wait; /* background push throttle */ void __percpu *xc_pcp; /* percpu CIL structures */ -#ifdef CONFIG_HOTPLUG_CPU - struct list_head xc_pcp_list; -#endif } ____cacheline_aligned_in_smp; /* xc_flags bit values */ @@ -705,9 +709,4 @@ xlog_kvmalloc( return p; } -/* - * CIL CPU dead notifier - */ -void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); - #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82c81d20459d..13b94d2e605b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -329,7 +329,7 @@ xlog_find_verify_cycle( * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ - bufblks = 1 << ffs(nbblks); + bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -1528,7 +1528,7 @@ xlog_write_log_records( * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ - bufblks = 1 << ffs(blocks); + bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a25eece3be2b..d19cca099bc3 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,7 @@ struct xfs_error_cfg { * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { + struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; @@ -67,9 +68,7 @@ struct xfs_inodegc { /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; -#if defined(DEBUG) || defined(XFS_WARN) unsigned int cpu; -#endif }; /* @@ -98,7 +97,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - struct list_head m_mount_list; /* global mount list */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* @@ -249,6 +247,9 @@ typedef struct xfs_mount { unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif + + /* cpus that have inodes queued for inactivation */ + struct cpumask m_inodegc_cpumask; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -404,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_SHRINK 8 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 9 +/* Mount time quotacheck is running */ +#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -426,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +#ifdef CONFIG_XFS_QUOTA +__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +#else +# define xfs_is_quotacheck_running(mp) (false) +#endif static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -443,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ - { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 4a9bbd3fe120..a7daa522e00f 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure( struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; struct xfs_agf *agf; - xfs_agblock_t agend; struct xfs_perag *pag; + xfs_agblock_t range_agend; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); @@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure( ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); agf = agf_bp->b_addr; - agend = min(be32_to_cpu(agf->agf_length), + range_agend = min(be32_to_cpu(agf->agf_length) - 1, ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = agend - ri_low.rm_startblock; + notify.blockcount = range_agend + 1 - ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6abcc34fafd8..086e78a6143a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1160,6 +1160,19 @@ xfs_qm_dqusage_adjust( if (error) return error; + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto error0; + } + } + ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { @@ -1173,6 +1186,7 @@ xfs_qm_dqusage_adjust( } nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); /* * Add the (disk blocks and inode) resources occupied by this @@ -1319,8 +1333,10 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } + xfs_set_quotacheck_running(mp); error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); + xfs_clear_quotacheck_running(mp); /* * On error, the inode walk may have partially populated the dquot diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index edd8587658d5..2d4444d61e98 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -477,6 +477,7 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; @@ -514,8 +515,9 @@ xfs_cui_item_recover( * doesn't fit. We need to reserve enough blocks to handle a * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 520c7ebdfed8..0e0e747028da 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -507,6 +507,7 @@ xfs_rui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; @@ -530,8 +531,9 @@ xfs_rui_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1f77014c6e1a..819a3568b28f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -56,28 +56,6 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#ifdef CONFIG_HOTPLUG_CPU -static LIST_HEAD(xfs_mount_list); -static DEFINE_SPINLOCK(xfs_mount_list_lock); - -static inline void xfs_mount_list_add(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_add(&mp->m_mount_list, &xfs_mount_list); - spin_unlock(&xfs_mount_list_lock); -} - -static inline void xfs_mount_list_del(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_del(&mp->m_mount_list); - spin_unlock(&xfs_mount_list_lock); -} -#else /* !CONFIG_HOTPLUG_CPU */ -static inline void xfs_mount_list_add(struct xfs_mount *mp) {} -static inline void xfs_mount_list_del(struct xfs_mount *mp) {} -#endif - enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -1135,9 +1113,8 @@ xfs_inodegc_init_percpu( for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); -#if defined(DEBUG) || defined(XFS_WARN) gc->cpu = cpu; -#endif + gc->mp = mp; init_llist_head(&gc->list); gc->items = 0; gc->error = 0; @@ -1168,7 +1145,6 @@ xfs_fs_put_super( xfs_freesb(mp); xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1577,13 +1553,6 @@ xfs_fs_fill_super( if (error) goto out_destroy_counters; - /* - * All percpu data structures requiring cleanup when a cpu goes offline - * must be allocated before adding this @mp to the cpu-dead handler's - * mount list. - */ - xfs_mount_list_add(mp); - /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { @@ -1781,7 +1750,6 @@ xfs_fs_fill_super( out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); @@ -2065,7 +2033,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); @@ -2326,49 +2294,6 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } -#ifdef CONFIG_HOTPLUG_CPU -static int -xfs_cpu_dead( - unsigned int cpu) -{ - struct xfs_mount *mp, *n; - - spin_lock(&xfs_mount_list_lock); - list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { - spin_unlock(&xfs_mount_list_lock); - xfs_inodegc_cpu_dead(mp, cpu); - xlog_cil_pcp_dead(mp->m_log, cpu); - spin_lock(&xfs_mount_list_lock); - } - spin_unlock(&xfs_mount_list_lock); - return 0; -} - -static int __init -xfs_cpu_hotplug_init(void) -{ - int error; - - error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, - xfs_cpu_dead); - if (error < 0) - xfs_alert(NULL, -"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", - error); - return error; -} - -static void -xfs_cpu_hotplug_destroy(void) -{ - cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static inline int xfs_cpu_hotplug_init(void) { return 0; } -static inline void xfs_cpu_hotplug_destroy(void) {} -#endif - STATIC int __init init_xfs_fs(void) { @@ -2385,13 +2310,9 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_cpu_hotplug_init(); - if (error) - goto out; - error = xfs_init_caches(); if (error) - goto out_destroy_hp; + goto out; error = xfs_init_workqueues(); if (error) @@ -2475,8 +2396,6 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_caches: xfs_destroy_caches(); - out_destroy_hp: - xfs_cpu_hotplug_destroy(); out: return error; } @@ -2500,7 +2419,6 @@ exit_xfs_fs(void) xfs_destroy_workqueues(); xfs_destroy_caches(); xfs_uuid_table_free(); - xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 902c7f67a117..3926cf7f2a6e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3824,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->new_ptr) ); +TRACE_EVENT(xfs_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xfs_inode_reload_unlinked_bucket, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS) +); + DECLARE_EVENT_CLASS(xfs_ag_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 43e5c219aaed..a3975f325f4e 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -46,6 +46,17 @@ xfs_attr_grab_log_assist( if (xfs_sb_version_haslogxattrs(&mp->m_sb)) return 0; + /* + * Check if the filesystem featureset is new enough to set this log + * incompat feature bit. Strictly speaking, the minimum requirement is + * a V5 filesystem for the superblock field, but we'll require rmap + * or reflink to avoid having to deal with really old kernels. + */ + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { + error = -EOPNOTSUPP; + goto drop_incompat; + } + /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); |