Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/dir.c            |   2
-rw-r--r--  fs/afs/fs_probe.c       |  11
-rw-r--r--  fs/afs/internal.h       |   1
-rw-r--r--  fs/afs/main.c           |   3
-rw-r--r--  fs/afs/server.c         |   3
-rw-r--r--  fs/block_dev.c          |  17
-rw-r--r--  fs/btrfs/block-group.c  |  44
-rw-r--r--  fs/btrfs/ctree.h        |   2
-rw-r--r--  fs/btrfs/file.c         |  46
-rw-r--r--  fs/btrfs/inode.c        |  39
-rw-r--r--  fs/btrfs/ioctl.c        |   2
-rw-r--r--  fs/btrfs/tree-log.c     |   5
-rw-r--r--  fs/erofs/zdata.h        |  20
-rw-r--r--  fs/file_table.c         |   2
-rw-r--r--  fs/io-wq.c              | 108
-rw-r--r--  fs/io-wq.h              |   4
-rw-r--r--  fs/io_uring.c           | 177
-rw-r--r--  fs/proc/bootconfig.c    |  15
-rw-r--r--  fs/proc/kcore.c         |   3
19 files changed, 328 insertions, 176 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 3e3c2bf0a722..96757f3abd74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* to FS.FetchStatus for op->file[1].
*/
op->fetch_status.which = 1;
- op->ops = &afs_fetch_status_operation;
+ op->ops = &afs_lookup_fetch_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b34f74b0f319..5d9ef517cf81 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
{
struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
- if (!queue_work(afs_wq, &net->fs_prober))
+ if (!net->live || !queue_work(afs_wq, &net->fs_prober))
afs_dec_servers_outstanding(net);
}
@@ -458,3 +458,12 @@ dont_wait:
return -ETIME;
return -EDESTADDRREQ;
}
+
+/*
+ * Clean up the probing when the namespace is killed off.
+ */
+void afs_fs_probe_cleanup(struct afs_net *net)
+{
+ if (del_timer_sync(&net->fs_probe_timer))
+ afs_dec_servers_outstanding(net);
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 573a5922c3bb..d520535ddb62 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1065,6 +1065,7 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+extern void afs_fs_probe_cleanup(struct afs_net *);
/*
* inode.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9c79c91e8005..31b472f7c734 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns)
timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
+ atomic_set(&net->servers_outstanding, 1);
ret = -ENOMEM;
sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
@@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns)
error_open_socket:
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
error_cell_init:
@@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
struct afs_net *net = afs_net(net_ns);
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
afs_close_socket(net);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 039e3488511c..e82e452e2612 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net)
_enter("");
if (del_timer_sync(&net->fs_timer))
- atomic_dec(&net->servers_outstanding);
+ afs_dec_servers_outstanding(net);
afs_queue_server_manager(net);
_debug("wait");
+ atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
_leave("");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e589388..0ae656e022fd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev)
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
@@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev)
invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
@@ -1565,10 +1564,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
if (!for_part) {
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0) {
- bdput(bdev);
+ if (ret != 0)
return ret;
- }
}
restart:
@@ -1637,8 +1634,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, mode, 1);
- if (ret)
+ if (ret) {
+ bdput(whole);
goto out_clear;
+ }
bdev->bd_contains = whole;
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
@@ -1688,7 +1687,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_unblock_events(disk);
put_disk_and_module(disk);
out:
- bdput(bdev);
return ret;
}
@@ -1755,6 +1753,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
bdput(whole);
}
+ if (res)
+ bdput(bdev);
+
return res;
}
EXPORT_SYMBOL(blkdev_get);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 176e8a292fd1..c037ef514b64 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out_put_group;
+ goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out_put_group;
+ goto out;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out_put_group;
+ goto out;
btrfs_release_path(path);
}
@@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1089,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->space_info->lock);
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
block_group->removed = 1;
@@ -1123,17 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->chunk_mutex);
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
-
- ret = remove_block_group_item(trans, path, block_group);
- if (ret < 0)
- goto out;
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1145,10 +1156,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_extent_map(em);
}
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 30ce7039bc27..d404cce8ae40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1009,6 +1009,8 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used only for subvolume roots. */
+ BTRFS_ROOT_HAS_LOG_TREE,
};
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2c14312b05e8..2520605afc25 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
+ num_bytes = lockend - lockstart + 1;
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
+ if (nowait) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ return -EAGAIN;
+
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ num_bytes);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
+ lockend, NULL);
+ }
- num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ if (!nowait)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-
+out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
@@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
+ &write_bytes, false) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1904,13 +1920,25 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
/*
* We will allocate space in case nodatacow is not set,
* so bail
*/
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
+ true) <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count) {
inode_unlock(inode);
return -EAGAIN;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d04c82c88418..18d384f4af54 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode,
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1361,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 range_start = start;
@@ -1391,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
* data space info, which we incremented in the step above.
*
* If we need to fallback to cow and the inode corresponds to a free
- * space cache inode, we must also increment bytes_may_use of the data
- * space_info for the same reason. Space caches always get a prealloc
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
* extent for them, however scrub or balance may have set the block
- * group that contains that extent to RO mode.
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0);
- if (count > 0 || is_space_ino) {
- const u64 bytes = is_space_ino ? range_bytes : count;
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
spin_lock(&sinfo->lock);
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
spin_unlock(&sinfo->lock);
@@ -7865,9 +7889,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
offset, count);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 168deb8ef68a..e8f7c5f00894 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2692,7 +2692,7 @@ out:
btrfs_put_root(root);
out_free:
btrfs_free_path(path);
- kzfree(subvol_info);
+ kfree(subvol_info);
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 920cee312f4e..cd5348f352dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
@@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root)
{
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 7824f5563a55..9b66c28b3ae9 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page)
static inline void z_erofs_onlinepage_fixup(struct page *page,
uintptr_t index, bool down)
{
- unsigned long *p, o, v, id;
-repeat:
- p = &page_private(page);
- o = READ_ONCE(*p);
+ union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+ int orig, orig_index, val;
- id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (id) {
+repeat:
+ orig = atomic_read(u.o);
+ orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+ if (orig_index) {
if (!index)
return;
- DBG_BUGON(id != index);
+ DBG_BUGON(orig_index != index);
}
- v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (cmpxchg(p, o, v) != o)
+ val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+ ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+ if (atomic_cmpxchg(u.o, orig, val) != orig)
goto repeat;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 656647f9575a..65603502fed6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -230,7 +230,7 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
- file = alloc_file(&path, flags, fops);
+ file = alloc_file(&path, flags | FMODE_NONOTIFY, fops);
if (IS_ERR(file)) {
ihold(inode);
path_put(&path);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0b65a912b036..47c5f3aeb460 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq)
struct io_cb_cancel_data {
work_cancel_fn *fn;
void *data;
+ int nr_running;
+ int nr_pending;
+ bool cancel_all;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
unsigned long flags;
- bool ret = false;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
@@ -920,74 +922,90 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
- ret = true;
+ match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
- return ret;
+ return match->nr_running && !match->cancel_all;
}
-static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
- bool found = false;
- /*
- * First check pending list, if we're lucky we can just remove it
- * from there. CANCEL_OK means that the work is returned as-new,
- * no completion will be posted for it.
- */
+retry:
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
+ if (!match->fn(work, match->data))
+ continue;
- if (match->fn(work, match->data)) {
- wq_list_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
+ wq_list_del(&wqe->work_list, node, prev);
+ spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
- return IO_WQ_CANCEL_OK;
+ match->nr_pending++;
+ if (!match->cancel_all)
+ return;
+
+ /* not safe to continue after unlock */
+ goto retry;
}
+ spin_unlock_irqrestore(&wqe->lock, flags);
+}
- /*
- * Now check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempt to signal cancellation. The
- * completion will run normally in this case.
- */
+static void io_wqe_cancel_running_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
+{
rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
+ io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
+ void *data, bool cancel_all)
{
struct io_cb_cancel_data match = {
- .fn = cancel,
- .data = data,
+ .fn = cancel,
+ .data = data,
+ .cancel_all = cancel_all,
};
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
+ /*
+ * First check pending list, if we're lucky we can just remove it
+ * from there. CANCEL_OK means that the work is returned as-new,
+ * no completion will be posted for it.
+ */
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- ret = io_wqe_cancel_work(wqe, &match);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
+ io_wqe_cancel_pending_work(wqe, &match);
+ if (match.nr_pending && !match.cancel_all)
+ return IO_WQ_CANCEL_OK;
}
- return ret;
+ /*
+ * Now check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempt to signal cancellation. The
+ * completion will run normally in this case.
+ */
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ io_wqe_cancel_running_work(wqe, &match);
+ if (match.nr_running && !match.cancel_all)
+ return IO_WQ_CANCEL_RUNNING;
+ }
+
+ if (match.nr_running)
+ return IO_WQ_CANCEL_RUNNING;
+ if (match.nr_pending)
+ return IO_WQ_CANCEL_OK;
+ return IO_WQ_CANCEL_NOTFOUND;
}
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
@@ -997,21 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
-}
-
-static bool io_wq_pid_match(struct io_wq_work *work, void *data)
-{
- pid_t pid = (pid_t) (unsigned long) data;
-
- return work->task_pid == pid;
-}
-
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
-{
- void *data = (void *) (unsigned long) pid;
-
- return io_wq_cancel_cb(wq, io_wq_pid_match, data);
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 8e138fa88b9f..071f1a997800 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -90,7 +90,6 @@ struct io_wq_work {
const struct cred *creds;
struct fs_struct *fs;
unsigned flags;
- pid_t task_pid;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -125,12 +124,11 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data);
+ void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..a78201b96179 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -598,6 +599,8 @@ enum {
REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* req->task is refcounted */
+ REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
@@ -910,6 +913,21 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static void io_get_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ return;
+ get_task_struct(req->task);
+ req->flags |= REQ_F_TASK_PINNED;
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ put_task_struct(req->task);
+}
+
static void io_file_put_work(struct work_struct *work);
/*
@@ -1045,8 +1063,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
spin_unlock(&current->fs->lock);
}
- if (!req->work.task_pid)
- req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -1087,6 +1103,7 @@ static inline void io_prep_async_work(struct io_kiocb *req,
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ io_req_init_async(req);
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
@@ -1398,9 +1415,7 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->task)
- put_task_struct(req->task);
-
+ __io_put_req_task(req);
io_req_work_drop_env(req);
}
@@ -1727,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req)
return cflags;
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1735,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
+ LIST_HEAD(again);
+
+ /* order with ->result store in io_complete_rw_iopoll() */
+ smp_rmb();
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, list);
+ if (READ_ONCE(req->result) == -EAGAIN) {
+ req->iopoll_completed = 0;
+ list_move_tail(&req->list, &again);
+ continue;
+ }
list_del(&req->list);
if (req->flags & REQ_F_BUFFER_SELECTED)
@@ -1758,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
-}
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
-
- do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- } while (!list_empty(again));
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1777,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
- LIST_HEAD(again);
bool spin;
int ret;
@@ -1803,13 +1829,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
- if (req->result == -EAGAIN) {
- list_move_tail(&req->list, &again);
- continue;
- }
- if (!list_empty(&again))
- break;
-
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1822,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
- if (!list_empty(&again))
- io_iopoll_queue(&again);
-
return ret;
}
@@ -1973,11 +1989,15 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result)
+ if (res != -EAGAIN && res != req->result)
req_set_fail_links(req);
- req->result = res;
- if (res != -EAGAIN)
+
+ WRITE_ONCE(req->result, res);
+ /* order with io_poll_complete() checking ->result */
+ if (res != -EAGAIN) {
+ smp_wmb();
WRITE_ONCE(req->iopoll_completed, 1);
+ }
}
/*
@@ -2650,8 +2670,8 @@ copy_iov:
}
}
out_free:
- kfree(iovec);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -2773,8 +2793,8 @@ copy_iov:
}
}
out_free:
- req->flags &= ~REQ_F_NEED_CLEANUP;
- kfree(iovec);
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -4236,6 +4256,28 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&pt->req->apoll->poll, pt, head);
}
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
@@ -4270,11 +4312,16 @@ static void io_async_task_func(struct callback_head *cb)
if (!canceled) {
__set_current_state(TASK_RUNNING);
+ if (io_sq_thread_acquire_mm(ctx, req)) {
+ io_cqring_add_event(req, -EFAULT);
+ goto end_req;
+ }
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
} else {
io_cqring_ev_posted(ctx);
+end_req:
req_set_fail_links(req);
io_double_put_req(req);
}
@@ -4366,8 +4413,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
memcpy(&apoll->work, &req->work, sizeof(req->work));
had_io = req->io != NULL;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4555,8 +4601,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
return 0;
}
@@ -4772,7 +4817,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -5817,17 +5862,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
- req->task = NULL;
+ req->task = current;
req->result = 0;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
- }
+ if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
@@ -5936,16 +5978,6 @@ fail_req:
return submitted;
}
-static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- }
-}
-
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
@@ -7331,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- wait_for_completion(&ctx->ref_comp);
+ /*
+ * If we're doing polled IO and end up having requests being
+ * submitted async (out-of-line), then completions can come in while
+ * we're waiting for refs to drop. We need to reap these manually,
+ * as nobody else will be looking for them.
+ */
+ while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+ io_iopoll_reap_events(ctx);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ }
io_ring_ctx_free(ctx);
}
@@ -7365,9 +7407,22 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+ struct files_struct *files = data;
+
+ return work->files == files;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
+ if (list_empty_careful(&ctx->inflight_list))
+ return;
+
+ /* cancel all at once, should be faster than doing it one by one*/
+ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
+
while (!list_empty_careful(&ctx->inflight_list)) {
struct io_kiocb *cancel_req = NULL, *req;
DEFINE_WAIT(wait);
@@ -7423,6 +7478,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
}
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+ return req->task == task;
+}
+
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
@@ -7433,7 +7496,7 @@ static int io_uring_flush(struct file *file, void *data)
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
return 0;
}
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 9955d75c0585..ad31ec4ad627 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v)
static int __init copy_xbc_key_value_list(char *dst, size_t size)
{
struct xbc_node *leaf, *vnode;
- const char *val;
char *key, *end = dst + size;
+ const char *val;
+ char q;
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
@@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
vnode = xbc_node_get_child(leaf);
- if (vnode && xbc_node_is_array(vnode)) {
+ if (vnode) {
xbc_array_for_each_value(vnode, val) {
- ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
- val, vnode->next ? ", " : "\n");
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, vnode->next ? ", " : "\n");
if (ret < 0)
goto out;
dst += ret;
}
} else {
- ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
if (ret < 0)
break;
dst += ret;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8ba492d44e68..e502414b3556 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* Using bounce buffer to bypass the
* hardened user copy kernel text checks.
*/
- if (probe_kernel_read(buf, (void *) start, tsz)) {
+ if (copy_from_kernel_nofault(buf, (void *)start,
+ tsz)) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;