Diffstat (limited to 'fs')
42 files changed, 1288 insertions, 741 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe402483ad5..30b066299d39 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); - int seq = 0; + struct key *key; + int ret, seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + if (!(query_flags & AT_STATX_DONT_SYNC) && + !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(&init_user_ns, inode, stat); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 47e72d72f7d0..32131a5d321b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -384,6 +384,17 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} + struct btrfs_dio_private { struct inode *inode; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7631b88426e..077c95e9baa5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1060,6 +1060,7 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; u64 nr_global_roots; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 71fd99b48283..f26202621989 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. + */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 126f244cdf88..31c3f592e587 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3157,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -3657,6 +3658,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. 
+ */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 724e8fe06aa0..33c19f51d79b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode, repair_bio = btrfs_bio_alloc(1); repair_bbio = btrfs_bio(repair_bio); + repair_bbio->file_offset = start; repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; @@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode, ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5082b9c70f8c..95c499b8424e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -481,17 +481,6 @@ static noinline int add_async_extent(struct async_chunk *cow, } /* - * Check if the inode has flags compatible with compression - */ -static inline bool inode_can_compress(struct btrfs_inode *inode) -{ - if (inode->flags & BTRFS_INODE_NODATACOW || - inode->flags & BTRFS_INODE_NODATASUM) - return false; - return true; -} - -/* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. 
*/ @@ -500,7 +489,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, { struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(inode)) { + if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); @@ -2019,7 +2008,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!btrfs_inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, @@ -7810,8 +7799,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - const u64 orig_file_offset = dip->file_offset; - u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; @@ -7821,6 +7808,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; for (i = 0; i < nr_sectors; i++) { + u64 start = bbio->file_offset + bio_offset; + ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, bbio, @@ -7833,17 +7822,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ASSERT((start - orig_file_offset) < UINT_MAX); - ret = btrfs_repair_one_sector(inode, - &bbio->bio, - start - orig_file_offset, - bvec.bv_page, pgoff, + ret = btrfs_repair_one_sector(inode, &bbio->bio, + bio_offset, bvec.bv_page, pgoff, start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); } - start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); bio_offset += sectorsize; pgoff += sectorsize; @@ -7870,6 +7855,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t err = bio->bi_status; if (err) @@ -7880,12 +7866,12 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); + err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); + btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8046,6 +8032,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1a6d2d5b4b33..1b31481f9e72 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; - int (*validate)(const char *value, size_t len); + int (*validate)(const struct btrfs_inode *inode, const char *value, + 
size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -55,7 +57,8 @@ find_prop_handler(const char *name, return NULL; } -int btrfs_validate_prop(const char *name, const char *value, size_t value_len) +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) { const struct prop_handler *handler; @@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len) if (value_len == 0) return 0; - return handler->validate(value, value_len); + return handler->validate(inode, value, value_len); +} + +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). + * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, @@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } -static int prop_compression_validate(const char *value, size_t len) +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) { + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + if (!value) return 0; @@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value, return 0; } +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. + */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { @@ -330,6 +375,7 @@ static struct prop_handler prop_handlers[] = { .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, + .ignore = prop_compression_ignore, .inheritable = 1 }, }; @@ -356,6 +402,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; + if (h->ignore(BTRFS_I(inode))) + continue; + value = h->extract(parent); if (!value) continue; @@ -364,7 +413,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
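/*
 * Editor's note: a standalone userspace sketch (not part of the patch) of the
 * handler-table pattern the props.c hunks above extend: each property handler
 * gains an ->ignore() callback next to ->validate(), and the inheritance loop
 * skips any handler whose ->ignore() says the inode type cannot use the
 * property. All names below (demo_inode, demo_prop_handler, ...) are
 * illustrative stand-ins, not the real btrfs types.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

struct demo_inode {
	mode_t mode;
};

struct demo_prop_handler {
	const char *xattr_name;
	int (*validate)(const struct demo_inode *inode, const char *value, size_t len);
	bool (*ignore)(const struct demo_inode *inode);
	int inheritable;
};

static int compression_validate(const struct demo_inode *inode, const char *value, size_t len)
{
	(void)inode;
	/* Accept a couple of well-known algorithm names, reject the rest. */
	if (len == 4 && (!strncmp(value, "zstd", 4) || !strncmp(value, "zlib", 4)))
		return 0;
	return -1;
}

static bool compression_ignore(const struct demo_inode *inode)
{
	/* Only regular files and directories benefit from the property. */
	return !S_ISREG(inode->mode) && !S_ISDIR(inode->mode);
}

static const struct demo_prop_handler handlers[] = {
	{ "btrfs.compression", compression_validate, compression_ignore, 1 },
};

/* Mimics the shape of inherit_props(): skip ignored handlers up front. */
static void inherit_demo(const struct demo_inode *child, const char *value)
{
	for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
		const struct demo_prop_handler *h = &handlers[i];

		if (!h->inheritable || h->ignore(child))
			continue;
		if (h->validate(child, value, strlen(value)))
			continue;
		printf("inherit %s=%s\n", h->xattr_name, value);
	}
}

int main(void)
{
	struct demo_inode file = { .mode = S_IFREG | 0644 };
	struct demo_inode fifo = { .mode = S_IFIFO | 0644 };

	inherit_demo(&file, "zstd");	/* prints */
	inherit_demo(&fifo, "zstd");	/* skipped by ->ignore() */
	return 0;
}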
*/ - ret = h->validate(value, strlen(value)); + ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 40b2c65b518c..59bea741cfcf 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -13,7 +13,9 @@ void __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags); -int btrfs_validate_prop(const char *name, const char *value, size_t value_len); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 11089568b287..8cd713d37ad2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. 
+ */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { spin_lock(&cache->lock); if (!cache->to_copy) { @@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ASSERT(cache->start == chunk_offset); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 17389a42a3ab..ba78ca5aabbb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 571dae8ad65e..e65633686378 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); goto out; } } @@ -3720,11 +3721,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. + */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3848,13 +3867,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, ret = insert_dir_log_key(trans, log, dst_path, ino, *last_old_dentry_offset + 1, key.offset - 1); - /* - * -EEXIST should never happen because when we - * log a directory in full mode (LOG_INODE_ALL) - * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from - * the log tree. - */ - ASSERT(ret != -EEXIST); if (ret < 0) return ret; } @@ -5804,6 +5816,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). 
+ */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* * Before logging the inode item, cache the value returned by * inode_logged(), because after that we have the need to figure out if * the inode was previously logged in this transaction. @@ -6181,7 +6205,7 @@ again: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); @@ -7018,12 +7042,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or - * still has a dentry in the old directory. So take the old - * directory's log_mutex to prevent getting an -EEXIST when - * logging a key to record the deletion, or having that other - * task logging the old directory get an -EEXIST if it attempts - * to log the same key after we just did it. In both cases that - * would result in falling back to a transaction commit. + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bd297f23d19e..f3e28f11cfb6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -328,6 +328,9 @@ struct btrfs_fs_devices { struct btrfs_bio { unsigned int mirror_num; + /* for direct I/O */ + u64 file_offset; + /* @device is for stripe IO submission. 
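/*
 * Editor's note: an illustrative userspace sketch (not part of the patch) of
 * the offset arithmetic the direct-I/O hunks above switch to, once the
 * btrfs_bio gains a file_offset field: instead of carrying a separate running
 * "start" cursor, each sector's logical file position is derived on the fly
 * from the bio's stored file offset plus the byte offset reached so far.
 * The constants below are placeholders, not btrfs values.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_SECTORSIZE 4096u

int main(void)
{
	uint64_t file_offset = 1u << 20;	/* where this bio starts in the file */
	uint32_t bio_offset = 0;		/* bytes checked so far within the bio */
	uint32_t bio_len = 4 * DEMO_SECTORSIZE;

	while (bio_offset < bio_len) {
		uint64_t start = file_offset + bio_offset;

		/* "start" is what a repair submission for this sector would use. */
		printf("sector at bio_offset %u -> file pos %llu\n",
		       bio_offset, (unsigned long long)start);
		bio_offset += DEMO_SECTORSIZE;
	}
	return 0;
}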
*/ struct btrfs_device *device; u8 *csum; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 99abf41b89b9..85691dc2232f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); @@ -403,10 +404,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); - ret = btrfs_validate_prop(name, value, size); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -416,7 +420,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1b1b310c3c51..d31b0eda210f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1835,6 +1835,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1842,35 +1848,23 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) if (device->zone_info->max_active_zones == 0) continue; - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } - if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; goto out_unlock; } - - /* Successfully activated all the zones */ - if (i == map->num_stripes - 1) - block_group->zone_is_active = 1; - - } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; spin_unlock(&block_group->lock); - if (block_group->zone_is_active) { - /* For the active block group list */ - btrfs_get_block_group(block_group); + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&block_group->active_bg_list, - &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); return true; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index cbf016a7bb5d..6dee76248cb4 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_lock(&inode->vfs_inode, 0); + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); } static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) @@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) struct btrfs_root *root = 
inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_unlock(&inode->vfs_inode, 0); + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); } #endif diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa25bffd4823..b6edcf89a429 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_dirty(folio)) { dout("%p dirty_folio %p idx %lu -- already dirty\n", mapping->host, folio, folio->index); - BUG_ON(!folio_get_private(folio)); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(folio_get_private(folio)); + VM_BUG_ON_FOLIO(folio_test_private(folio), folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, } WARN_ON(!folio_test_locked(folio)); - if (folio_get_private(folio)) { + if (folio_test_private(folio)) { dout("%p invalidate_folio idx %lu full dirty page\n", inode, folio->index); @@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f1ad6884d4da..5c14ef04e474 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2274,6 +2274,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -2294,6 +2296,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -3870,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", inode, ci, mds, mseq, target); retry: + down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) @@ -3933,6 +3938,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ @@ -3958,6 +3964,7 @@ retry: out_unlock: spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c9e837aa1d3..8c8226c0feac 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = 
req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative */ return -ENOENT; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fa38c013126d..00c3de177dd6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4434,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - struct ceph_fs_client *fsc = s->s_mdsc->fsc; - switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4444,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when not force unmounting */ - WARN_ON_ONCE(s->s_ttl && - READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); - fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f5366feea82d..909085a78f9c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,9 +98,9 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs && - sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { - set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (page->index == sbi->metapage_eio_ofs) { + if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->metapage_eio_ofs = page->index; sbi->metapage_eio_cnt = 0; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e0c2e773c8d..9a1a526f2092 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -388,11 +388,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = io_flag & temp_mask; - unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: @@ -401,9 +413,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) * Cold | Warm | Hot | Cold | Warm | Hot | */ if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; + op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; + op_flags |= REQ_FUA; + return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) @@ -413,14 +426,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; - if 
(fio->type == DATA) - __attach_io_flag(fio, sbi->data_io_flag); - else if (fio->type == NODE) - __attach_io_flag(fio, sbi->node_io_flag); - bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); - bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO, - &f2fs_bioset); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cd1e65bcf0b0..8c570de21ed5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -154,7 +154,6 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ @@ -1334,12 +1333,6 @@ enum { }; enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ -}; - -enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; @@ -3657,8 +3650,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 71f232dcf3c2..83639238a1fe 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -550,7 +550,8 @@ make_now: } f2fs_set_inode_flags(inode); - if (file_should_truncate(inode)) { + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { ret = f2fs_truncate(inode); if (ret) goto bad_inode; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 22dfeb991529..bd9731cdec56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3243,101 +3243,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. 
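/*
 * Editor's note: a standalone sketch (not part of the patch) of the bitmap
 * decoding done by the f2fs_io_flags() helper introduced in the data.c hunk
 * above: the per-superblock io_flag packs one FUA bit and one META bit per
 * temperature, FUA bits in the low NR_TEMP_TYPE bits and META bits in the next
 * NR_TEMP_TYPE bits. The REQ_FUA/REQ_META values and temperature numbering
 * here are illustrative placeholders, not the kernel's definitions.
 */
#include <stdio.h>

enum { DEMO_HOT, DEMO_WARM, DEMO_COLD, DEMO_NR_TEMP };

#define DEMO_REQ_FUA	0x1u
#define DEMO_REQ_META	0x2u

static unsigned int demo_io_flags(unsigned int io_flag, int temp)
{
	unsigned int temp_mask = (1u << DEMO_NR_TEMP) - 1;
	unsigned int fua_flag = io_flag & temp_mask;
	unsigned int meta_flag = (io_flag >> DEMO_NR_TEMP) & temp_mask;
	unsigned int op_flags = 0;

	if ((1u << temp) & meta_flag)
		op_flags |= DEMO_REQ_META;
	if ((1u << temp) & fua_flag)
		op_flags |= DEMO_REQ_FUA;
	return op_flags;
}

int main(void)
{
	/* FUA for hot data only (bit 0), META for cold data only (bit 5). */
	unsigned int io_flag = (1u << DEMO_HOT) | ((1u << DEMO_COLD) << DEMO_NR_TEMP);

	for (int t = DEMO_HOT; t < DEMO_NR_TEMP; t++)
		printf("temp %d -> op_flags 0x%x\n", t, demo_io_flags(io_flag, t));
	return 0;
}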
- * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea939db18f88..4368f90571bd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -138,7 +138,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -214,7 +213,6 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -975,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1328,12 +1310,6 @@ default_check: return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. 
- */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1978,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2033,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); @@ -2314,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 591fe9cf1659..1fae0196292a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode, */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED) && + (inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 39080b2d6cf8..b6697333bb2b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ - loff_t blockmask = i_blocksize(inode) - 1; - loff_t end = (pos + length) & ~blockmask; + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; - pos = (pos + written + blockmask) & ~blockmask; - if (pos < end) { - truncate_pagecache_range(inode, pos, end - 1); - punch_hole(ip, pos, end - pos); + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); } } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 22b41acfbbc3..2556ae1f92ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? 
ret : ret1; } -static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, +static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, size_t *prev_count, size_t *window_size) { size_t count = iov_iter_count(i); size_t size, offs; - if (likely(!count)) - return false; - if (ret <= 0 && ret != -EFAULT) + if (!count) return false; if (!iter_is_iovec(i)) return false; size = PAGE_SIZE; - offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); + offs = offset_in_page(iocb->ki_pos); if (*prev_count != count || !*window_size) { size_t nr_dirtied; - size = ALIGN(offs + count, PAGE_SIZE); - size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); - size = min(size, nr_dirtied << PAGE_SHIFT); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; @@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -835,35 +832,31 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, written); + IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) - written = ret; - - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; + read = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return written; + return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -873,7 +866,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -899,41 +892,37 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) - goto out; + goto out_unlock; from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, read); + IOMAP_DIO_PARTIAL, written); from->nofault = false; - - if (ret == -ENOTBLK) - ret = 0; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } if (ret > 0) - read = ret; - - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; + written = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + 
window_size -= fault_in_iov_iter_readable(from, window_size); + if (window_size) goto retry; - } } -out: +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return read; + return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -941,7 +930,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -962,7 +951,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ret >= 0) { if (!iov_iter_count(to)) return ret; - written = ret; + read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; @@ -975,32 +964,26 @@ retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) - written += ret; + read += ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - - gfs2_holder_allow_demote(&gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(&gh); - if (leftover != window_size) { - if (gfs2_holder_queued(&gh)) - goto retry_under_glock; - if (written) - goto out_uninit; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(&gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - return written ? written : ret; + return read ? read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, @@ -1014,7 +997,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -1032,10 +1015,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); retry: + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + window_size -= fault_in_iov_iter_readable(from, window_size); + if (!window_size) { + ret = -EFAULT; + goto out_uninit; + } + from->count = min(from->count, window_size); + } ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: + if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1052,27 +1043,19 @@ retry_under_glock: current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; - read += ret; + written += ret; } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); - from->count = orig_count - read; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - from->count = min(from->count, window_size - leftover); - if (gfs2_holder_queued(gh)) - goto retry_under_glock; - if (read && !(iocb->ki_flags & IOCB_DIRECT)) - goto out_uninit; - goto retry; - } + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + + from->count = orig_count - written; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; } 
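/*
 * Editor's note: a userspace control-flow sketch (not part of the patch) of
 * the simplified retry scheme the gfs2 read/write hunks above converge on:
 * attempt the I/O while holding the lock, and if it fails with EFAULT, drop
 * the lock, fault in a window of the user buffer, and retry from the top.
 * Every function below is a stub standing in for the glock and fault-in
 * helpers, so the example only demonstrates the retry shape.
 */
#include <errno.h>
#include <stdio.h>

static int faulted_in;			/* simulates whether the buffer is resident */

static void demo_lock(void)   { puts("lock"); }
static void demo_unlock(void) { puts("unlock"); }

static long demo_do_io(void)
{
	/* First attempt hits an unresident page, later attempts succeed. */
	return faulted_in ? 4096 : -EFAULT;
}

static size_t demo_fault_in_window(size_t window)
{
	faulted_in = 1;
	return window;			/* bytes successfully faulted in */
}

int main(void)
{
	size_t window = 1u << 20;
	long ret;

retry:
	demo_lock();
	ret = demo_do_io();
	demo_unlock();
	/* Only retry when faulting in made progress, mirroring the patch. */
	if (ret == -EFAULT && demo_fault_in_window(window))
		goto retry;

	printf("ret = %ld\n", ret);
	return 0;
}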
out_unlock: if (gfs2_holder_queued(gh)) @@ -1081,8 +1064,8 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - from->count = orig_count - read; - return read ? read : ret; + from->count = orig_count - written; + return written ? written : ret; } /** diff --git a/fs/io-wq.c b/fs/io-wq.c index 32aeb2c581c5..824623bcf1a5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { - set_notify_signal(worker->task); + __set_notify_signal(worker->task); wake_up_process(worker->task); return false; } @@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, { if (work && match->fn(work, match->data)) { work->flags |= IO_WQ_WORK_CANCEL; - set_notify_signal(worker->task); + __set_notify_signal(worker->task); return true; } diff --git a/fs/io_uring.c b/fs/io_uring.c index fdd497f92a6e..e4c9f776919b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -95,7 +95,7 @@ #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 15) +#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -117,6 +117,8 @@ #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -170,7 +172,7 @@ struct io_rings { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 sq_flags; + atomic_t sq_flags; /* * Runtime CQ flags * @@ -258,6 +260,8 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; }; struct io_rsrc_node { @@ -282,10 +286,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head list; - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. 
+ */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -358,7 +378,7 @@ struct io_ev_fd { struct rcu_head rcu; }; -#define IO_BUFFERS_HASH_BITS 5 +#define BGID_ARRAY 64 struct io_ring_ctx { /* const or read-mostly hot data */ @@ -367,6 +387,7 @@ struct io_ring_ctx { struct io_rings *rings; unsigned int flags; + enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -411,11 +432,14 @@ struct io_ring_ctx { struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct list_head *io_buffers; - struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; u32 pers_next; @@ -617,7 +641,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; - u32 flags; + rwf_t flags; }; struct io_connect { @@ -634,9 +658,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; - int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -804,6 +828,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -814,6 +839,7 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -854,6 +880,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -878,6 +906,8 @@ enum { REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), }; struct async_poll { @@ -956,6 +986,11 @@ struct io_kiocb { u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ u16 buf_index; unsigned int flags; @@ -965,14 +1000,26 @@ struct io_kiocb { struct task_struct *task; struct io_rsrc_node *rsrc_node; - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. 
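/*
 * Editor's note: a compact userspace sketch (not part of the patch) of the
 * storage layout the io_buffer_list union above describes: one struct serves
 * both "classic" provided buffers (a linked list) and ring-mapped buffers
 * (page array plus ring), with buf_nr_pages acting as the discriminant.
 * The types and names are simplified stand-ins, not the real io_uring
 * definitions.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_buf_ring;				/* stands in for io_uring_buf_ring */
struct demo_list_head { struct demo_list_head *next, *prev; };

struct demo_buffer_list {
	union {
		struct demo_list_head buf_list;	/* classic provided buffers */
		struct {
			void **buf_pages;	/* ring-mapped buffers */
			struct demo_buf_ring *buf_ring;
		};
	};
	unsigned short bgid;
	unsigned short buf_nr_pages;		/* zero => classic, nonzero => ring */
};

static bool demo_is_ring_mapped(const struct demo_buffer_list *bl)
{
	return bl->buf_nr_pages != 0;
}

int main(void)
{
	struct demo_buffer_list classic = { .bgid = 1, .buf_nr_pages = 0 };
	struct demo_buffer_list ring = { .bgid = 2, .buf_nr_pages = 4 };

	printf("bgid 1 ring-mapped: %d\n", demo_is_ring_mapped(&classic));
	printf("bgid 2 ring-mapped: %d\n", demo_is_ring_mapped(&ring));
	return 0;
}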
+ */ + struct io_buffer_list *buf_list; + }; union { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ - int apoll_events; + __poll_t apoll_events; }; atomic_t refs; atomic_t poll_refs; @@ -983,8 +1030,6 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ @@ -1035,12 +1080,20 @@ struct io_op_def { unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; }; static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = {}, + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .buffer_select = 1, + }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -1049,6 +1102,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -1059,6 +1114,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { @@ -1071,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -1080,6 +1139,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { @@ -1122,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1144,6 +1206,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_CLOSE] = {}, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, @@ -1155,6 +1218,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1164,6 +1229,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { @@ -1198,9 +1265,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -1218,6 +1287,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_LINKAT] = {}, [IORING_OP_MSG_RING] = { .needs_file = 1, + .iopoll = 1, }, [IORING_OP_FSETXATTR] = { .needs_file = 1 @@ -1249,7 +1319,7 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline struct file 
*io_file_get_normal(struct io_kiocb *req, int fd); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void io_queue_sqe(struct io_kiocb *req); @@ -1442,21 +1512,23 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - struct io_buffer *kbuf = req->kbuf; - unsigned int cflags; + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&kbuf->list, list); - req->kbuf = NULL; - return cflags; + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1466,7 +1538,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1481,7 +1553,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. */ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1499,15 +1574,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - struct list_head *hash_list; - struct io_buffer_list *bl; - - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - list_for_each_entry(bl, hash_list, list) - if (bl->bgid == bgid || bgid == -1U) - return bl; + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; - return NULL; + return xa_load(&ctx->io_bl_xa, bgid); } static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1516,11 +1586,23 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. 
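/*
 * Editor's note: a small standalone illustration (not part of the patch) of
 * the completion-flag packing that __io_put_kbuf() above now derives directly
 * from req->buf_index: the buffer ID is shifted into the upper bits of the
 * CQE flags next to a "buffer was consumed" bit. The two constants are
 * illustrative stand-ins for IORING_CQE_F_BUFFER and IORING_CQE_BUFFER_SHIFT,
 * not guaranteed to match the UAPI values.
 */
#include <stdio.h>

#define DEMO_CQE_F_BUFFER	(1u << 0)
#define DEMO_CQE_BUFFER_SHIFT	16

static unsigned int pack_buffer_id(unsigned short bid)
{
	return DEMO_CQE_F_BUFFER | ((unsigned int)bid << DEMO_CQE_BUFFER_SHIFT);
}

static unsigned short unpack_buffer_id(unsigned int cflags)
{
	return (unsigned short)(cflags >> DEMO_CQE_BUFFER_SHIFT);
}

int main(void)
{
	unsigned int cflags = pack_buffer_id(42);

	printf("cflags 0x%x -> used buffer %u\n", cflags, unpack_buffer_id(cflags));
	return 0;
}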
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } io_ring_submit_lock(ctx, issue_flags); @@ -1528,7 +1610,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->kbuf = NULL; + req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); } @@ -1613,12 +1695,14 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i, hash_bits; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + xa_init(&ctx->io_bl_xa); + /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. @@ -1640,13 +1724,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, - sizeof(struct list_head), GFP_KERNEL); - if (!ctx->io_buffers) - goto err; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) - INIT_LIST_HEAD(&ctx->io_buffers[i]); - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1682,7 +1759,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } @@ -2046,8 +2124,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } io_commit_cqring(ctx); @@ -2141,8 +2218,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } if (list_empty(&ctx->cq_overflow_list)) { set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; @@ -2525,6 +2601,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2641,8 +2719,8 @@ static void tctx_task_work(struct callback_head *cb) static void io_req_task_work_add(struct io_kiocb *req, bool priority) { struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; struct io_uring_task *tctx = tsk->io_uring; - enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; bool running; @@ -2665,18 +2743,11 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. 
- */ - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { - if (notify == TWA_NONE) - wake_up_process(tsk); + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) return; - } spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; @@ -3299,6 +3370,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); + /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3447,56 +3519,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) { - struct list_head *list; - - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - INIT_LIST_HEAD(&bl->buf_list); bl->bgid = bgid; - list_add(&bl->list, list); + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) { - struct io_buffer *kbuf = req->kbuf; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - - if (req->flags & REQ_F_BUFFER_SELECTED) - return kbuf; - - io_ring_submit_lock(req->ctx, issue_flags); + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; - bl = io_buffer_get_list(ctx, bgid); - if (bl && !list_empty(&bl->buf_list)) { kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return NULL; + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; } else { - kbuf = ERR_PTR(-ENOBUFS); + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; - io_ring_submit_unlock(req->ctx, issue_flags); - return kbuf; + if (issue_flags & IO_URING_F_UNLOCKED) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. 
For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { - struct io_buffer *kbuf; - u16 bgid; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; - bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, issue_flags); - if (IS_ERR(kbuf)) - return kbuf; - return u64_to_user_ptr(kbuf->addr); + io_ring_submit_lock(req->ctx, issue_flags); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; } #ifdef CONFIG_COMPAT @@ -3506,7 +3618,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; - ssize_t len; + size_t len; uiov = u64_to_user_ptr(req->rw.addr); if (!access_ok(uiov, sizeof(*uiov))) @@ -3517,11 +3629,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = (compat_size_t) len; + req->rw.len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3539,22 +3652,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = len; + req->rw.len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf = req->kbuf; - - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov[0].iov_len = kbuf->len; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); + iov[0].iov_len = req->rw.len; return 0; } if (req->rw.len != 1) @@ -3568,6 +3680,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -3586,18 +3705,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, return NULL; } - /* buffer index only valid with fixed read/write, or buffer select */ - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) - return ERR_PTR(-EINVAL); - buf = u64_to_user_ptr(req->rw.addr); sqe_len = req->rw.len; if (opcode 
== IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); - if (IS_ERR(buf)) - return ERR_CAST(buf); + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -3899,6 +4015,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; @@ -4163,9 +4280,7 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4460,10 +4575,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, struct io_unlink *un = &req->unlink; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4507,10 +4619,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, struct io_mkdir *mkd = &req->mkdir; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4548,10 +4657,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, struct io_symlink *sl = &req->symlink; const char __user *oldpath, *newpath; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4595,9 +4701,7 @@ static int io_linkat_prep(struct io_kiocb *req, struct io_hardlink *lnk = &req->hardlink; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4642,9 +4746,7 @@ static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; @@ -4682,9 +4784,6 @@ static int __io_splice_prep(struct io_kiocb *req, struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sp->len = READ_ONCE(sqe->len); sp->flags = 
READ_ONCE(sqe->splice_flags); if (unlikely(sp->flags & ~valid_flags)) @@ -4783,20 +4882,25 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + void __user *buf; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + if (req->flags & REQ_F_BUFFER_SELECT) { + size_t len = 1; - __io_req_complete(req, issue_flags, 0, 0); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + } + + __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags)); return 0; } static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) return -EINVAL; req->msg.user_data = READ_ONCE(sqe->off); @@ -4832,17 +4936,15 @@ done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; return 0; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4873,10 +4975,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4905,9 +5004,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4962,6 +5059,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } +static int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + if (table->alloc_hint >= nr) + table->alloc_hint = 0; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) { + table->alloc_hint = ret + 1; + return ret; + } + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) 
{ + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -5017,8 +5169,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -5039,7 +5191,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || sqe->splice_fd_in) return -EINVAL; @@ -5062,6 +5214,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -5088,8 +5254,12 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); @@ -5106,7 +5276,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -5203,6 +5373,23 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 
0 : -ENOMEM; } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; @@ -5212,14 +5399,30 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } + bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; } - io_buffer_add_list(ctx, bl, p->bgid); + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; } ret = io_add_buffers(ctx, p, bl); @@ -5236,9 +5439,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -5282,9 +5483,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); @@ -5315,9 +5514,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); @@ -5353,9 +5550,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *path; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5397,10 +5592,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5432,7 +5624,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) spin_unlock(&files->file_lock); goto err; } - file = fdt->fd[close->fd]; + file = 
rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; @@ -5466,12 +5659,7 @@ err: static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -5547,11 +5735,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) + return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5586,7 +5779,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5628,6 +5825,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5641,7 +5842,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5758,14 +5959,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, return __io_recvmsg_copy_hdr(req, iomsg); } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5780,12 +5973,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) + return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->bgid = READ_ONCE(sqe->buf_group); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5803,7 +6000,6 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_async_msghdr iomsg, *kmsg; struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; - struct io_buffer *kbuf; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5821,24 +6017,29 @@ static int 
io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - kmsg->fast_iov[0].iov_len = req->sr_msg.len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, - 1, req->sr_msg.len); + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); } - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return io_setup_async_msg(req, kmsg); @@ -5868,28 +6069,32 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; - void __user *buf = sr->buf; struct socket *sock; struct iovec iov; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - buf = u64_to_user_ptr(kbuf->addr); + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; } - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -5900,7 +6105,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_iocb = NULL; msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5936,29 +6141,39 @@ out_free: static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; + unsigned flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) - return -EINVAL; + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } 
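/*
 * Illustrative userspace sketch, not taken from the patch: arming the
 * multishot accept that the prep code above validates.  The multishot flag
 * travels in sqe->ioprio and, when accepted files go into the fixed table,
 * only the "pick a free slot" mode is allowed.  "sqe" is assumed to be a
 * struct io_uring_sqe the caller obtained from its own ring setup, and the
 * helper names are made up for the example.
 */
#include <string.h>
#include <stdbool.h>
#include <linux/io_uring.h>

static void prep_multishot_accept(struct io_uring_sqe *sqe, int listen_fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd = listen_fd;
	sqe->ioprio = IORING_ACCEPT_MULTISHOT;		/* flag added by this series */
	sqe->file_index = IORING_FILE_INDEX_ALLOC;	/* optional: auto-pick a fixed slot */
}

/*
 * One such SQE produces a CQE per accepted connection; every completion but
 * the last carries IORING_CQE_F_MORE, so the request only needs to be
 * re-armed once that flag is absent.
 */
static bool accept_still_armed(const struct io_uring_cqe *cqe)
{
	return cqe->res >= 0 && (cqe->flags & IORING_CQE_F_MORE);
}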
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { + struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -5966,6 +6181,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; +retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5977,8 +6193,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = 0; + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); @@ -5986,11 +6211,30 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) fd_install(fd, file); ret = fd; } else { - ret = io_install_fixed_file(req, file, issue_flags, - accept->file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); } - __io_req_complete(req, issue_flags, ret, 0); - return 0; + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; } static int io_connect_prep_async(struct io_kiocb *req) @@ -6005,10 +6249,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -6190,6 +6431,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. @@ -6198,10 +6440,10 @@ static void io_poll_remove_entries(struct io_kiocb *req) * either spurious wakeup or multishot CQE is served. 0 when it's done with * the request, then the mask is stored in req->cqe.res. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool locked) +static int io_poll_check_events(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - int v; + int v, ret; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) @@ -6225,23 +6467,37 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } - /* multishot, just fill an CQE and proceed */ - if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask, - IORING_CQE_F_MORE); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (unlikely(!filled)) - return -ECANCELED; - io_cqring_ev_posted(ctx); - } else if (req->cqe.res) { - return 0; + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; } + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + ret = io_issue_sqe(req, + IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (ret) + return ret; + /* * Release all references, retry if someone tried to restart * task_work while we were executing it. @@ -6256,7 +6512,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -6281,7 +6537,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -6296,7 +6552,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) { req->cqe.res = mask; /* @@ -6315,7 +6571,8 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) io_req_task_work_add(req, false); } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res, events); @@ -6330,6 +6587,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -6364,7 +6622,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, } /* for instances that support it check for an event match first */ - if (mask && !(mask & poll->events)) + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { @@ -6521,7 +6779,7 @@ static int 
io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = POLLPRI | POLLERR; int ret; if (!def->pollin && !def->pollout) @@ -6530,16 +6788,18 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_ABORTED; if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; if (def->pollin) { - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if ((req->opcode == IORING_OP_RECVMSG) && (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; + mask &= ~EPOLLIN; } else { - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; @@ -6691,9 +6951,7 @@ static int io_poll_update_prep(struct io_kiocb *req, struct io_poll_update *upd = &req->poll_update; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6723,9 +6981,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_poll_iocb *poll = &req->poll; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) @@ -6932,11 +7188,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { struct io_timeout_rem *tr = &req->timeout_rem; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; @@ -7006,10 +7260,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned flags; u32 off = READ_ONCE(sqe->off); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || - sqe->splice_fd_in) + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -7191,11 +7442,9 @@ out: static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -7281,7 +7530,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -7411,7 +7660,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int 
io_req_prep_async(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_async_setup) + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->cqe.fd); + if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; @@ -7794,6 +8048,8 @@ fail: * wait for request slots on the block side. */ if (!needs_poll) { + if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + break; cond_resched(); continue; } @@ -7850,6 +8106,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); + WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: io_ring_submit_unlock(ctx, issue_flags); return file; @@ -8095,9 +8352,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[opcode].buffer_select) - return -EOPNOTSUPP; + if (sqe_flags & IOSQE_BUFFER_SELECT) { + if (!io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + req->buf_index = READ_ONCE(sqe->buf_group); + } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { @@ -8120,6 +8379,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } + if (!io_op_defs[opcode].ioprio && sqe->ioprio) + return -EINVAL; + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; @@ -8377,23 +8641,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ - /* Tell userspace we may need a wakeup call */ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { unsigned int to_submit; @@ -8509,8 +8756,8 @@ static int io_sq_thread(void *data) bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - io_ring_set_wakeup_flag(ctx); - + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; @@ -8521,7 +8768,7 @@ static int io_sq_thread(void *data) * Ensure the store of the wakeup flag is not * reordered with the load of the SQ tail */ - smp_mb(); + smp_mb__after_atomic(); if (io_sqring_entries(ctx)) { needs_sched = false; @@ -8535,7 +8782,8 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_clear_wakeup_flag(ctx); + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); } finish_wait(&sqd->wait, &wait); @@ -8545,7 +8793,7 @@ static int io_sq_thread(void *data) io_uring_cancel_generic(true, sqd); sqd->thread = NULL; 
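/*
 * Illustrative userspace sketch, not taken from the patch: with the shared SQ
 * flags word now updated atomically (IORING_SQ_NEED_WAKEUP for an idle SQPOLL
 * thread, IORING_SQ_TASKRUN when the ring was created with
 * IORING_SETUP_TASKRUN_FLAG), the submitter's "do I need to enter the
 * kernel?" check looks roughly like this.  "sq_flags" is assumed to point at
 * the mmap()ed SQ ring flags field, the helper name is made up for the
 * example, and error handling is omitted.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static void kick_ring_if_needed(int ring_fd, unsigned int *sq_flags,
				unsigned int to_submit)
{
	unsigned int flags = __atomic_load_n(sq_flags, __ATOMIC_ACQUIRE);
	unsigned int enter_flags = 0;

	if (flags & IORING_SQ_NEED_WAKEUP)	/* SQPOLL thread went to sleep */
		enter_flags |= IORING_ENTER_SQ_WAKEUP;
	if (flags & IORING_SQ_TASKRUN)		/* deferred task_work is pending */
		enter_flags |= IORING_ENTER_GETEVENTS;

	if (to_submit || enter_flags)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			enter_flags, NULL, 0);
}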
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); @@ -8925,20 +9173,45 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL_ACCOUNT); - return !!table->files; + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; } static void io_free_file_tables(struct io_file_table *table) { kvfree(table->files); + bitmap_free(table->bitmap); table->files = NULL; + table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + if (bit == table->alloc_hint) + table->alloc_hint++; +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { +#if !defined(IO_URING_SCM_ALL) int i; -#if !defined(IO_URING_SCM_ALL) for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(ctx, i); @@ -8946,6 +9219,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) continue; if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) continue; + io_file_bitmap_clear(&ctx->file_table, i); fput(file); } #endif @@ -9314,12 +9588,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; @@ -9349,6 +9623,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } io_rsrc_node_switch(ctx, NULL); @@ -9409,6 +9684,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (ret) goto err; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); needs_switch = true; } @@ -9416,6 +9692,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (!ret) { *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); } err: if (needs_switch) @@ -9457,6 +9734,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) goto out; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: @@ -9506,6 +9784,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); needs_switch = true; } if (fd != -1) { @@ -9534,6 +9813,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } } @@ -9961,30 +10241,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct 
page **pages, return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, + int *npages) { - struct io_mapped_ubuf *imu = NULL; + unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - unsigned long off, start, end, ubuf; - size_t size; - int ret, pret, nr_pages, i; + int i, pret, ret = -ENOMEM; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - ubuf = (unsigned long) iov->iov_base; - end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - *pimu = NULL; - ret = -ENOMEM; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto done; @@ -9994,10 +10262,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -10015,6 +10279,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, break; } } + *npages = nr_pages; } else { ret = pret < 0 ? pret : -EFAULT; } @@ -10028,14 +10293,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unpin_user_pages(pages, pret); goto done; } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; - ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { - unpin_user_pages(pages, pret); + unpin_user_pages(pages, nr_pages); goto done; } - off = ubuf & ~PAGE_MASK; + off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; for (i = 0; i < nr_pages; i++) { size_t vec_len; @@ -10048,8 +10352,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } /* store original address for later verification */ - imu->ubuf = ubuf; - imu->ubuf_end = ubuf + iov->iov_len; + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; @@ -10057,7 +10361,6 @@ done: if (ret) kvfree(imu); kvfree(pages); - kvfree(vmas); return ret; } @@ -10116,12 +10419,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } 
else { + memset(&iov, 0, sizeof(iov)); + } + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; @@ -10260,19 +10568,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; + unsigned long index; int i; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { - struct list_head *list = &ctx->io_buffers[i]; - - while (!list_empty(list)) { - struct io_buffer_list *bl; + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } - bl = list_first_entry(list, struct io_buffer_list, list); - __io_remove_buffers(ctx, bl, -1U); - list_del(&bl->list); - kfree(bl); - } + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -10381,7 +10689,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -11009,7 +11318,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, ret = -EFAULT; break; } - if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) { + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } @@ -11623,6 +11932,25 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = get_uid(current_user()); /* + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. + */ + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) + goto err; + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; + ctx->notify_method = TWA_SIGNAL; + } + + /* * This is just grabbed for accounting purposes. 
When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount @@ -11719,10 +12047,11 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -11935,14 +12264,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -12077,6 +12412,85 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -12102,6 +12516,9 @@ static int __io_uring_register(struct io_ring_ctx 
*ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: @@ -12111,6 +12528,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: @@ -12205,6 +12625,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -12290,6 +12722,10 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); @@ -12299,6 +12735,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 61a8edc4ba8b..e205fde7163a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1406,7 +1406,12 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_root *root; + + if (!kn) + return; + + root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e2d59bb5e6bb..9a16897e8dc6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else - ctx->flags &= NFS_MOUNT_SOFTREVAL; + ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 16106f805ffa..a79f66432bd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) @@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata) pnfs_roc_release(&data->lr.arg, &data->lr.res, data->res.lr_ret); if (inode) { - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); 
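Returning to io_register_pbuf_ring() above: IORING_REGISTER_PBUF_RING is new uAPI, and a hedged userspace sketch may make its constraints easier to see. It assumes a 5.19-era <linux/io_uring.h> that provides struct io_uring_buf_reg and the opcode, it calls the io_uring_register syscall directly, and the helper name and 4 KiB page-size shortcut are illustrative only.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Register a provided-buffer ring for buffer group bgid on an existing
 * ring fd.  This mirrors what io_register_pbuf_ring() enforces above:
 * the ring memory must be page aligned, ring_entries must be a power of
 * two, all reserved fields must be zero, and the register call takes
 * exactly one argument structure (nr_args == 1).
 */
static void *register_buf_ring(int ring_fd, unsigned int entries,
                               unsigned short bgid)
{
        struct io_uring_buf_reg reg;
        void *ring;

        /* struct io_uring_buf is 16 bytes; one 4 KiB page covers small rings */
        ring = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (ring == MAP_FAILED)
                return NULL;

        memset(&reg, 0, sizeof(reg));
        reg.ring_addr = (unsigned long)ring;
        reg.ring_entries = entries;     /* must be a power of two */
        reg.bgid = bgid;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
                munmap(ring, 4096);
                return NULL;
        }
        /* the mapping now backs the struct io_uring_buf_ring for this bgid */
        return ring;
}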
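The nfs_fs_context_parse_param() fix above is a single-character change, but the reasoning is worth spelling out: with a single-bit mask, &= can only clear bits (it wipes every other mount flag and keeps SOFTREVAL only if it was somehow already set), whereas |= adds the requested flag and leaves the rest alone. A tiny standalone check, using placeholder values rather than the real NFS_MOUNT_* constants:

#include <assert.h>

#define MNT_SOFT      0x0001    /* placeholder values, not the real NFS flags */
#define MNT_SOFTREVAL 0x8000

int main(void)
{
        unsigned int buggy = MNT_SOFT;   /* some option already set */
        unsigned int fixed = MNT_SOFT;

        buggy &= MNT_SOFTREVAL;          /* old code: clears MNT_SOFT, sets nothing */
        fixed |= MNT_SOFTREVAL;          /* fixed code: adds SOFTREVAL, keeps MNT_SOFT */

        assert(buggy == 0);
        assert(fixed == (MNT_SOFT | MNT_SOFTREVAL));
        return 0;
}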
nfs_iput_and_deactive(inode); } kfree(calldata); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..a792e21c5309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + /* + * FAN_RENAME is not allowed on non-dir (for now). + * We shouldn't have allowed setting any dirent events in mask of + * non-dir, but because we always allowed it, error only if group + * was initialized with the new flag FAN_REPORT_TARGET_FID. + */ + ret = -ENOTDIR; + if (inode && !S_ISDIR(inode->i_mode) && + ((mask & FAN_RENAME) || + ((mask & FANOTIFY_DIRENT_EVENTS) && + FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b31..913bef0d2a36 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ out: return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0ed4861b038f..b3d5f97f16cd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy(udf_get_fi_ident(sfi), fileident, lfi); + memcpy(sfi->impUse + liu, fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy(udf_get_fi_ident(sfi), fileident, -offset); + memcpy(sfi->impUse + liu, fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); + memset(sfi->impUse + liu + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); + memset(sfi->impUse + liu + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e1afb9e503e1..bf4e60871068 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -406,7 +406,7 @@ xfs_buf_alloc_pages( STATIC int _xfs_buf_map_pages( struct xfs_buf *bp, - 
uint flags) + xfs_buf_flags_t flags) { ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { @@ -868,7 +868,7 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -903,7 +903,7 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edcb6254fa6a..1ee3056ff9cf 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,28 +22,28 @@ struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ -#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ +#define XBF_READ (1u << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ +#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1 << 16)/* inode buffer */ -#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ -#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ +#define _XBF_INODES (1u << 16)/* inode buffer */ +#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1u << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ +#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ - { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ @@ -247,11 +247,11 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, - struct xfs_buf **bpp); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + 
xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, int flags, struct xfs_buf **bpp, - const struct xfs_buf_ops *ops); + size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9de6205fe134..39ae53efb3ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2594,14 +2594,13 @@ xfs_ifree_cluster( } /* - * This is called to return an inode to the inode free list. - * The inode should already be truncated to 0 length and have - * no pages associated with it. This routine also assumes that - * the inode is already a part of the transaction. + * This is called to return an inode to the inode free list. The inode should + * already be truncated to 0 length and have no pages associated with it. This + * routine also assumes that the inode is already a part of the transaction. * - * The on-disk copy of the inode will have been added to the list - * of unlinked inodes in the AGI. We need to remove the inode from - * that list atomically with respect to freeing it here. + * The on-disk copy of the inode will have been added to the list of unlinked + * inodes in the AGI. We need to remove the inode from that list atomically with + * respect to freeing it here. */ int xfs_ifree( @@ -2623,13 +2622,16 @@ xfs_ifree( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* - * Pull the on-disk inode from the AGI unlinked list. + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. */ - error = xfs_iunlink_remove(tp, pag, ip); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - goto out; + return error; - error = xfs_difree(tp, pag, ip->i_ino, &xic); + error = xfs_iunlink_remove(tp, pag, ip); if (error) goto out; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index de177842b951..0c82673238f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -175,7 +175,7 @@ xfs_trans_get_buf( struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3614c7834007..e20e7c841489 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -35,6 +35,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode, lockdep_assert_held(&zi->i_truncate_mutex); + /* + * With ZNS drives, closing an explicitly open zone that has not been + * written will change the zone state to "closed", that is, the zone + * will remain active. Since this can then cause failure of explicit + * open operation on other zones if the drive active zone resources + * are exceeded, make sure that the zone does not remain active by + * resetting it. 
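Stepping back to the fs/xfs/xfs_buf.h hunk above, which retypes the flag arguments as xfs_buf_flags_t and switches every XBF_* definition from 1 << n to 1u << n: the unsigned suffix matters once bit 31 (XBF_UNMAPPED) is in play, because shifting a signed 1 into the sign bit is undefined behaviour and, as typically compiled, yields a negative value that sign-extends when widened. A small standalone illustration, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

#define FLAG_SIGNED   (1 << 31)  /* old style: signed int, strictly undefined, negative in practice */
#define FLAG_UNSIGNED (1u << 31) /* new style: unsigned, value is exactly 0x80000000 */

int main(void)
{
        uint64_t wide_signed   = FLAG_SIGNED;   /* sign-extends to 0xffffffff80000000 */
        uint64_t wide_unsigned = FLAG_UNSIGNED; /* zero-extends to 0x0000000080000000 */

        printf("%#llx vs %#llx\n",
               (unsigned long long)wide_signed,
               (unsigned long long)wide_unsigned);
        return 0;
}

Typing the prototypes as xfs_buf_flags_t keeps the flag words unsigned end to end, so the well-defined unsigned values are never funneled through a signed int parameter.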
+ */ + if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + op = REQ_OP_ZONE_RESET; + trace_zonefs_zone_mgmt(inode, op); ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); @@ -1142,6 +1153,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; + zi->i_flags = 0; return &zi->i_vnode; } @@ -1293,12 +1305,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, inc_nlink(parent); } -static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; @@ -1323,6 +1336,22 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed first + * to ensure that the initial number of open zones is 0, in sync with + * the open zone accounting done when the mount option + * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (type == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + mutex_lock(&zi->i_truncate_mutex); + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + mutex_unlock(&zi->i_truncate_mutex); + } + + return ret; } static struct dentry *zonefs_create_inode(struct dentry *parent, @@ -1332,6 +1361,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, struct inode *dir = d_inode(parent); struct dentry *dentry; struct inode *inode; + int ret; dentry = d_alloc_name(parent, name); if (!dentry) @@ -1342,10 +1372,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, goto dput; inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) - zonefs_init_file_inode(inode, zone, type); - else + if (zone) { + ret = zonefs_init_file_inode(inode, zone, type); + if (ret) { + iput(inode); + goto dput; + } + } else { zonefs_init_dir_inode(dir, inode, type); + } + d_add(dentry, inode); dir->i_size++; |
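The zonefs changes above make two related decisions: zonefs_zone_mgmt() turns a close of a never-written zone (i_wpoffset == 0) into a reset so the zone does not linger in the closed-but-still-active state, and zonefs_init_file_inode() closes any implicitly or explicitly open sequential zone at mount time so the open-zone accounting used with the explicit-open mount option starts from zero. A userspace analogue of the close-versus-reset decision, sketched with the generic blkzoned ioctls (an illustration under those assumptions, not part of zonefs):

#include <linux/blkzoned.h>
#include <sys/ioctl.h>

/*
 * Close a sequential zone, unless it has never been written (write
 * pointer still at the zone start), in which case reset it instead so
 * it does not keep occupying one of the drive's active zone resources.
 * The caller supplies the zone geometry, e.g. from a prior BLKREPORTZONE.
 */
static int close_or_reset_zone(int bdev_fd, __u64 zone_start_sector,
                               __u64 zone_len_sectors, __u64 wp_sector)
{
        struct blk_zone_range range = {
                .sector     = zone_start_sector,
                .nr_sectors = zone_len_sectors,
        };
        unsigned long op = (wp_sector == zone_start_sector) ?
                                BLKRESETZONE : BLKCLOSEZONE;

        return ioctl(bdev_fd, op, &range);
}

Resetting is safe here precisely because the zone holds no data; for a written zone, a close keeps the contents, while on ZNS drives it leaves the zone active, which is exactly the condition the hunk above avoids for empty zones.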