summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/file.c1
-rw-r--r--fs/afs/cell.c9
-rw-r--r--fs/afs/dir.c66
-rw-r--r--fs/afs/dir_silly.c38
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/fs_operation.c14
-rw-r--r--fs/afs/fs_probe.c11
-rw-r--r--fs/afs/inode.c91
-rw-r--r--fs/afs/internal.h39
-rw-r--r--fs/afs/main.c3
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/security.c2
-rw-r--r--fs/afs/server.c3
-rw-r--r--fs/afs/write.c13
-rw-r--r--fs/afs/yfsclient.c93
-rw-r--r--fs/aio.c2
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/block_dev.c324
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c255
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h11
-rw-r--r--fs/btrfs/check-integrity.c27
-rw-r--r--fs/btrfs/compression.c30
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c19
-rw-r--r--fs/btrfs/ctree.h129
-rw-r--r--fs/btrfs/delalloc-space.c36
-rw-r--r--fs/btrfs/delalloc-space.h10
-rw-r--r--fs/btrfs/discard.c1
-rw-r--r--fs/btrfs/disk-io.c113
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-io-tree.h5
-rw-r--r--fs/btrfs/extent-tree.c17
-rw-r--r--fs/btrfs/extent_io.c286
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c184
-rw-r--r--fs/btrfs/free-space-cache.c23
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c597
-rw-r--r--fs/btrfs/ioctl.c88
-rw-r--r--fs/btrfs/ordered-data.c63
-rw-r--r--fs/btrfs/ordered-data.h19
-rw-r--r--fs/btrfs/qgroup.c359
-rw-r--r--fs/btrfs/qgroup.h24
-rw-r--r--fs/btrfs/raid56.c65
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/reflink.c26
-rw-r--r--fs/btrfs/relocation.c71
-rw-r--r--fs/btrfs/scrub.c153
-rw-r--r--fs/btrfs/space-info.c4
-rw-r--r--fs/btrfs/super.c148
-rw-r--r--fs/btrfs/sysfs.c163
-rw-r--r--fs/btrfs/sysfs.h7
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c2
-rw-r--r--fs/btrfs/tests/inode-tests.c14
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/transaction.h28
-rw-r--r--fs/btrfs/tree-defrag.c5
-rw-r--r--fs/btrfs/tree-log.c55
-rw-r--r--fs/btrfs/volumes.c141
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/buffer.c11
-rw-r--r--fs/cachefiles/rdwr.c2
-rw-r--r--fs/cifs/cifs_debug.c6
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/connect.c10
-rw-r--r--fs/cifs/file.c30
-rw-r--r--fs/cifs/inode.c9
-rw-r--r--fs/cifs/ioctl.c9
-rw-r--r--fs/cifs/misc.c16
-rw-r--r--fs/cifs/smb2misc.c8
-rw-r--r--fs/cifs/smb2ops.c14
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/crypto/Kconfig8
-rw-r--r--fs/crypto/Makefile1
-rw-r--r--fs/crypto/bio.c51
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c45
-rw-r--r--fs/crypto/fscrypt_private.h144
-rw-r--r--fs/crypto/inline_crypt.c367
-rw-r--r--fs/crypto/keyring.c21
-rw-r--r--fs/crypto/keysetup.c91
-rw-r--r--fs/crypto/keysetup_v1.c20
-rw-r--r--fs/crypto/policy.c20
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/efivarfs/file.c7
-rw-r--r--fs/efivarfs/super.c6
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/erofs/zdata.c2
-rw-r--r--fs/erofs/zdata.h20
-rw-r--r--fs/exfat/dir.c14
-rw-r--r--fs/exfat/exfat_fs.h3
-rw-r--r--fs/exfat/file.c21
-rw-r--r--fs/exfat/namei.c14
-rw-r--r--fs/exfat/nls.c8
-rw-r--r--fs/exfat/super.c10
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h27
-rw-r--r--fs/ext4/extents.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c30
-rw-r--r--fs/ext4/ioctl.c65
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/page-io.c6
-rw-r--r--fs/ext4/readpage.c11
-rw-r--r--fs/ext4/super.c136
-rw-r--r--fs/ext4/verity.c5
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/ext4/xattr_hurd.c51
-rw-r--r--fs/f2fs/compress.c2
-rw-r--r--fs/f2fs/data.c83
-rw-r--r--fs/f2fs/super.c35
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/file.c134
-rw-r--r--fs/fuse/inode.c19
-rw-r--r--fs/gfs2/aops.c47
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/file.c52
-rw-r--r--fs/gfs2/glock.c5
-rw-r--r--fs/gfs2/glops.c10
-rw-r--r--fs/gfs2/incore.h1
-rw-r--r--fs/gfs2/inode.c3
-rw-r--r--fs/gfs2/log.c25
-rw-r--r--fs/gfs2/log.h4
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/ops_fstype.c13
-rw-r--r--fs/gfs2/recovery.c4
-rw-r--r--fs/gfs2/super.c20
-rw-r--r--fs/hfs/inode.c1
-rw-r--r--fs/hfsplus/unicode.c2
-rw-r--r--fs/internal.h17
-rw-r--r--fs/io-wq.c122
-rw-r--r--fs/io-wq.h15
-rw-r--r--fs/io_uring.c2687
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/isofs/namei.c4
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jffs2/erase.c2
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/summary.h4
-rw-r--r--fs/jfs/jfs_mount.c1
-rw-r--r--fs/jfs/resize.c1
-rw-r--r--fs/locks.c1
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c11
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c22
-rw-r--r--fs/nfsd/nfs4state.c28
-rw-r--r--fs/nfsd/nfsctl.c25
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/ntfs/dir.c1
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/dir.c14
-rw-r--r--fs/ocfs2/dlmglue.c17
-rw-r--r--fs/ocfs2/extent_map.c4
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/suballoc.c9
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/overlayfs/copy_up.c4
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c10
-rw-r--r--fs/overlayfs/namei.c15
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c73
-rw-r--r--fs/proc/bootconfig.c15
-rw-r--r--fs/proc/devices.c1
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/proc_sysctl.c6
-rw-r--r--fs/pstore/platform.c5
-rw-r--r--fs/quota/dquot.c1
-rw-r--r--fs/read_write.c131
-rw-r--r--fs/reiserfs/procfs.c1
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/squashfs/squashfs_fs.h16
-rw-r--r--fs/ubifs/commit.c6
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c4
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/lpt.c2
-rw-r--r--fs/ubifs/tnc.c6
-rw-r--r--fs/ubifs/tnc_misc.c4
-rw-r--r--fs/udf/balloc.c2
-rw-r--r--fs/userfaultfd.c39
-rw-r--r--fs/verity/open.c15
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_log_cil.c10
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_pwork.c2
-rw-r--r--fs/zonefs/super.c18
208 files changed, 5780 insertions, 3437 deletions
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a3cc8ecb50da..d553bb5bc17a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/user_namespace.h>
+#include <linux/blkdev.h>
#include "adfs.h"
#include "dir_f.h"
#include "dir_fplus.h"
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a85817f54483..a26a0f96c119 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -14,6 +14,7 @@
*/
#include <linux/uio.h>
+#include <linux/blkdev.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 005921e3b38d..5b79cdceefa0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -154,10 +154,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
+ cell->name = kmalloc(namelen + 1, GFP_KERNEL);
+ if (!cell->name) {
+ kfree(cell);
+ return ERR_PTR(-ENOMEM);
+ }
+
cell->net = net;
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
+ cell->name[i] = 0;
atomic_set(&cell->usage, 2);
INIT_WORK(&cell->manager, afs_manage_cell);
@@ -207,6 +214,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ kfree(cell->name);
kfree(cell);
_leave(" = %d", ret);
return ERR_PTR(ret);
@@ -489,6 +497,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
afs_put_cell(cell->net, cell->alias_of);
key_put(cell->anonymous_key);
+ kfree(cell->name);
kfree(cell);
_leave(" [destroyed]");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index aa1d34141ea3..1d2e61e0ab04 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -648,7 +648,7 @@ static void afs_do_lookup_success(struct afs_operation *op)
vp = &op->file[0];
abort_code = vp->scb.status.abort_code;
if (abort_code != 0) {
- op->abort_code = abort_code;
+ op->ac.abort_code = abort_code;
op->error = afs_abort_to_error(abort_code);
}
break;
@@ -696,10 +696,11 @@ static const struct afs_operation_ops afs_inline_bulk_status_operation = {
.success = afs_do_lookup_success,
};
-static const struct afs_operation_ops afs_fetch_status_operation = {
+static const struct afs_operation_ops afs_lookup_fetch_status_operation = {
.issue_afs_rpc = afs_fs_fetch_status,
.issue_yfs_rpc = yfs_fs_fetch_status,
.success = afs_do_lookup_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
@@ -844,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* to FS.FetchStatus for op->file[1].
*/
op->fetch_status.which = 1;
- op->ops = &afs_fetch_status_operation;
+ op->ops = &afs_lookup_fetch_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
@@ -1052,7 +1053,7 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct afs_vnode *vnode, *dir;
- struct afs_fid uninitialized_var(fid);
+ struct afs_fid fid;
struct dentry *parent;
struct inode *inode;
struct key *key;
@@ -1236,6 +1237,17 @@ void afs_d_release(struct dentry *dentry)
_enter("%pd", dentry);
}
+void afs_check_for_remote_deletion(struct afs_operation *op)
+{
+ struct afs_vnode *vnode = op->file[0].vnode;
+
+ switch (op->ac.abort_code) {
+ case VNOVNODE:
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ afs_break_callback(vnode, afs_cb_break_for_deleted);
+ }
+}
+
/*
* Create a new inode for create/mkdir/symlink
*/
@@ -1268,7 +1280,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
static void afs_create_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
afs_vnode_new_inode(op);
@@ -1302,6 +1314,7 @@ static const struct afs_operation_ops afs_mkdir_operation = {
.issue_afs_rpc = afs_fs_make_dir,
.issue_yfs_rpc = yfs_fs_make_dir,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1325,6 +1338,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFDIR | mode;
op->create.reason = afs_edit_dir_for_mkdir;
@@ -1350,7 +1364,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
static void afs_rmdir_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
}
@@ -1382,6 +1396,7 @@ static const struct afs_operation_ops afs_rmdir_operation = {
.issue_afs_rpc = afs_fs_remove_dir,
.issue_yfs_rpc = yfs_fs_remove_dir,
.success = afs_rmdir_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_rmdir_edit_dir,
.put = afs_rmdir_put,
};
@@ -1404,6 +1419,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->ops = &afs_rmdir_operation;
@@ -1479,7 +1495,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
static void afs_unlink_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[1]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1511,6 +1528,7 @@ static const struct afs_operation_ops afs_unlink_operation = {
.issue_afs_rpc = afs_fs_remove_file,
.issue_yfs_rpc = yfs_fs_remove_file,
.success = afs_unlink_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_unlink_edit_dir,
.put = afs_unlink_put,
};
@@ -1537,6 +1555,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
/* Try to make sure we have a callback promise on the victim. */
ret = afs_validate(vnode, op->key);
@@ -1561,9 +1580,25 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
op->file[1].vnode = vnode;
+ op->file[1].update_ctime = true;
+ op->file[1].op_unlinked = true;
op->dentry = dentry;
op->ops = &afs_unlink_operation;
- return afs_do_sync_operation(op);
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+ if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+ return afs_put_operation(op);
error:
return afs_put_operation(op);
@@ -1573,6 +1608,7 @@ static const struct afs_operation_ops afs_create_operation = {
.issue_afs_rpc = afs_fs_create_file,
.issue_yfs_rpc = yfs_fs_create_file,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1601,6 +1637,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFREG | mode;
@@ -1620,6 +1657,7 @@ static void afs_link_success(struct afs_operation *op)
struct afs_vnode_param *vp = &op->file[1];
_enter("op=%08x", op->debug_id);
+ op->ctime = dvp->scb.status.mtime_client;
afs_vnode_commit_status(op, dvp);
afs_vnode_commit_status(op, vp);
afs_update_dentry_version(op, dvp, op->dentry);
@@ -1640,6 +1678,7 @@ static const struct afs_operation_ops afs_link_operation = {
.issue_afs_rpc = afs_fs_link,
.issue_yfs_rpc = yfs_fs_link,
.success = afs_link_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_link_put,
};
@@ -1672,6 +1711,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = dentry;
op->dentry_2 = from;
@@ -1689,6 +1730,7 @@ static const struct afs_operation_ops afs_symlink_operation = {
.issue_afs_rpc = afs_fs_symlink,
.issue_yfs_rpc = yfs_fs_symlink,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1740,9 +1782,13 @@ static void afs_rename_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_check_dir_conflict(op, &op->file[1]);
afs_vnode_commit_status(op, &op->file[0]);
- if (op->file[1].vnode != op->file[0].vnode)
+ if (op->file[1].vnode != op->file[0].vnode) {
+ op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
+ }
}
static void afs_rename_edit_dir(struct afs_operation *op)
@@ -1860,6 +1906,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index b14e3d9a25e2..04f75a44f243 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -16,6 +16,7 @@ static void afs_silly_rename_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
}
@@ -69,6 +70,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
return PTR_ERR(op);
afs_op_set_vnode(op, 0, dvnode);
+ afs_op_set_vnode(op, 1, dvnode);
+ op->file[0].dv_delta = 1;
+ op->file[1].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = old;
op->dentry_2 = new;
@@ -129,6 +135,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
switch (ret) {
case 0:
/* The rename succeeded. */
+ set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags);
d_move(dentry, sdentry);
break;
case -ERESTARTSYS:
@@ -148,19 +155,11 @@ out:
static void afs_silly_unlink_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = op->file[1].vnode;
-
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[1]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
-
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
- }
}
static void afs_silly_unlink_edit_dir(struct afs_operation *op)
@@ -181,6 +180,7 @@ static const struct afs_operation_ops afs_silly_unlink_operation = {
.issue_afs_rpc = afs_fs_remove_file,
.issue_yfs_rpc = yfs_fs_remove_file,
.success = afs_silly_unlink_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_silly_unlink_edit_dir,
};
@@ -200,12 +200,30 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
+ op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].op_unlinked = true;
+ op->file[1].update_ctime = true;
op->dentry = dentry;
op->ops = &afs_silly_unlink_operation;
trace_afs_silly_rename(vnode, true);
- return afs_do_sync_operation(op);
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+ if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+ return afs_put_operation(op);
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 506c47471b42..6f6ed1605cfe 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -225,7 +225,6 @@ static void afs_fetch_data_success(struct afs_operation *op)
struct afs_vnode *vnode = op->file[0].vnode;
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, vnode);
afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
@@ -240,6 +239,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
+ .aborted = afs_check_for_remote_deletion,
.put = afs_fetch_data_put,
};
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 71eea2a908c7..ffb8575345ca 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -175,10 +175,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
static void afs_lock_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = op->file[0].vnode;
-
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, vnode);
afs_vnode_commit_status(op, &op->file[0]);
}
@@ -186,6 +183,7 @@ static const struct afs_operation_ops afs_set_lock_operation = {
.issue_afs_rpc = afs_fs_set_lock,
.issue_yfs_rpc = yfs_fs_set_lock,
.success = afs_lock_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 2d2dff5688a4..24fd163c6323 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- op->error = -EINTR;
+ op->error = -ERESTARTSYS;
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
return false;
@@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
- op->error = -EINTR;
+ op->error = -ERESTARTSYS;
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
op->flags &= ~AFS_OPERATION_LOCK_0;
@@ -187,9 +187,17 @@ void afs_wait_for_operation(struct afs_operation *op)
op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
}
- if (op->error == 0) {
+ switch (op->error) {
+ case 0:
_debug("success");
op->ops->success(op);
+ break;
+ case -ECONNABORTED:
+ if (op->ops->aborted)
+ op->ops->aborted(op);
+ break;
+ default:
+ break;
}
afs_end_vnode_operation(op);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b34f74b0f319..5d9ef517cf81 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
{
struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
- if (!queue_work(afs_wq, &net->fs_prober))
+ if (!net->live || !queue_work(afs_wq, &net->fs_prober))
afs_dec_servers_outstanding(net);
}
@@ -458,3 +458,12 @@ dont_wait:
return -ETIME;
return -EDESTADDRREQ;
}
+
+/*
+ * Clean up the probing when the namespace is killed off.
+ */
+void afs_fs_probe_cleanup(struct afs_net *net)
+{
+ if (del_timer_sync(&net->fs_probe_timer))
+ afs_dec_servers_outstanding(net);
+}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index cd0a0060950b..1d13d2e882ad 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -165,9 +165,11 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
+ struct inode *inode = &vnode->vfs_inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
+ bool change_size = vp->set_size;
_enter("{%llx:%llu.%u} %s",
vp->fid.vid, vp->fid.vnode, vp->fid.unique,
@@ -186,25 +188,25 @@ static void afs_apply_status(struct afs_operation *op,
}
if (status->nlink != vnode->status.nlink)
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(inode, status->nlink);
if (status->owner != vnode->status.owner)
- vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+ inode->i_uid = make_kuid(&init_user_ns, status->owner);
if (status->group != vnode->status.group)
- vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+ inode->i_gid = make_kgid(&init_user_ns, status->group);
if (status->mode != vnode->status.mode) {
- mode = vnode->vfs_inode.i_mode;
+ mode = inode->i_mode;
mode &= ~S_IALLUGO;
mode |= status->mode;
- WRITE_ONCE(vnode->vfs_inode.i_mode, mode);
+ WRITE_ONCE(inode->i_mode, mode);
}
t = status->mtime_client;
- vnode->vfs_inode.i_ctime = t;
- vnode->vfs_inode.i_mtime = t;
- vnode->vfs_inode.i_atime = t;
+ inode->i_mtime = t;
+ if (vp->update_ctime)
+ inode->i_ctime = op->ctime;
if (vnode->status.data_version != status->data_version)
data_changed = true;
@@ -226,6 +228,7 @@ static void afs_apply_status(struct afs_operation *op,
} else {
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
+ change_size = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -233,11 +236,22 @@ static void afs_apply_status(struct afs_operation *op,
*/
if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
data_changed = false;
+ change_size = true;
}
if (data_changed) {
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
- afs_set_i_size(vnode, status->size);
+ inode_set_iversion_raw(inode, status->data_version);
+
+ /* Only update the size if the data version jumped. If the
+ * file is being modified locally, then we might have our own
+ * idea of what the size should be that's not the same as
+ * what's on the server.
+ */
+ if (change_size) {
+ afs_set_i_size(vnode, status->size);
+ inode->i_ctime = t;
+ inode->i_atime = t;
+ }
}
}
@@ -267,32 +281,39 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
_enter("");
- ASSERTCMP(op->error, ==, 0);
-
write_seqlock(&vnode->cb_lock);
if (vp->scb.have_error) {
+ /* A YFS server will return this from RemoveFile2 and AFS and
+ * YFS will return this from InlineBulkStatus.
+ */
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_nlink(&vnode->vfs_inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
+ op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
- } else {
- if (vp->scb.have_status)
- afs_apply_status(op, vp);
+ } else if (vp->scb.have_status) {
+ afs_apply_status(op, vp);
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
+ } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ drop_nlink(&vnode->vfs_inode);
+ if (vnode->vfs_inode.i_nlink == 0) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ __afs_break_callback(vnode, afs_cb_break_for_deleted);
+ }
}
write_sequnlock(&vnode->cb_lock);
- if (op->error == 0 && vp->scb.have_status)
+ if (vp->scb.have_status)
afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
}
static void afs_fetch_status_success(struct afs_operation *op)
{
- struct afs_vnode_param *vp = &op->file[0];
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
struct afs_vnode *vnode = vp->vnode;
int ret;
@@ -306,10 +327,11 @@ static void afs_fetch_status_success(struct afs_operation *op)
}
}
-static const struct afs_operation_ops afs_fetch_status_operation = {
+const struct afs_operation_ops afs_fetch_status_operation = {
.issue_afs_rpc = afs_fs_fetch_status,
.issue_yfs_rpc = yfs_fs_fetch_status,
.success = afs_fetch_status_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
@@ -716,6 +738,9 @@ int afs_getattr(const struct path *path, struct kstat *stat,
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(inode, stat);
+ if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
+ stat->nlink > 0)
+ stat->nlink -= 1;
} while (need_seqretry(&vnode->cb_lock, seq));
done_seqretry(&vnode->cb_lock, seq);
@@ -785,7 +810,15 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
+ struct inode *inode = &op->file[0].vnode->vfs_inode;
+
afs_vnode_commit_status(op, &op->file[0]);
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
+ if (size > i_size)
+ pagecache_isize_extended(inode, i_size, size);
+ truncate_pagecache(inode, size);
+ }
}
static const struct afs_operation_ops afs_setattr_operation = {
@@ -801,17 +834,31 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+ int ret;
_enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
- ATTR_MTIME))) {
+ ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET |
+ ATTR_TOUCH))) {
_leave(" = 0 [unsupported]");
return 0;
}
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (!S_ISREG(vnode->vfs_inode.i_mode))
+ return -EISDIR;
+
+ ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size);
+ if (ret)
+ return ret;
+
+ if (attr->ia_size == i_size_read(&vnode->vfs_inode))
+ attr->ia_valid &= ~ATTR_SIZE;
+ }
+
/* flush any dirty data outstanding on a regular file */
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
@@ -825,8 +872,12 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
- if (attr->ia_valid & ATTR_SIZE)
+ if (attr->ia_valid & ATTR_SIZE) {
op->file[0].dv_delta = 1;
+ op->file[0].set_size = true;
+ }
+ op->ctime = attr->ia_ctime;
+ op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
return afs_do_sync_operation(op);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0c9806ef2a19..792ac711985e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -388,7 +388,7 @@ struct afs_cell {
struct afs_vlserver_list __rcu *vl_servers;
u8 name_len; /* Length of name */
- char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */
+ char *name; /* Cell name, case-flattened and NUL-padded */
};
/*
@@ -634,6 +634,7 @@ struct afs_vnode {
#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
+#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
@@ -744,8 +745,11 @@ struct afs_vnode_param {
afs_dataversion_t dv_before; /* Data version before the call */
unsigned int cb_break_before; /* cb_break + cb_s_break before the call */
u8 dv_delta; /* Expected change in data version */
- bool put_vnode; /* T if we have a ref on the vnode */
- bool need_io_lock; /* T if we need the I/O lock on this */
+ bool put_vnode:1; /* T if we have a ref on the vnode */
+ bool need_io_lock:1; /* T if we need the I/O lock on this */
+ bool update_ctime:1; /* Need to update the ctime */
+ bool set_size:1; /* Must update i_size */
+ bool op_unlinked:1; /* True if file was unlinked by op */
};
/*
@@ -766,9 +770,9 @@ struct afs_operation {
struct dentry *dentry; /* Dentry to be altered */
struct dentry *dentry_2; /* Second dentry to be altered */
struct timespec64 mtime; /* Modification time to record */
+ struct timespec64 ctime; /* Change time to set */
short nr_files; /* Number of entries in file[], more_files */
short error;
- unsigned int abort_code;
unsigned int debug_id;
unsigned int cb_v_break; /* Volume break counter before op */
@@ -837,6 +841,7 @@ struct afs_operation {
#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */
#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */
#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */
+#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */
};
/*
@@ -932,6 +937,7 @@ extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations;
extern void afs_d_release(struct dentry *);
+extern void afs_check_for_remote_deletion(struct afs_operation *);
/*
* dir_edit.c
@@ -1059,10 +1065,13 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+extern void afs_fs_probe_cleanup(struct afs_net *);
/*
* inode.c
*/
+extern const struct afs_operation_ops afs_fetch_status_operation;
+
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
@@ -1435,7 +1444,6 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
/*
* yfsclient.c
*/
-extern void yfs_fs_fetch_file_status(struct afs_operation *);
extern void yfs_fs_fetch_data(struct afs_operation *);
extern void yfs_fs_create_file(struct afs_operation *);
extern void yfs_fs_make_dir(struct afs_operation *);
@@ -1481,15 +1489,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
return &vnode->vfs_inode;
}
-static inline void afs_check_for_remote_deletion(struct afs_operation *op,
- struct afs_vnode *vnode)
-{
- if (op->error == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- afs_break_callback(vnode, afs_cb_break_for_deleted);
- }
-}
-
/*
* Note that a dentry got changed. We need to set d_fsdata to the data version
* number derived from the result of the operation. It doesn't matter if
@@ -1504,6 +1503,18 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
(void *)(unsigned long)dir_vp->scb.status.data_version;
}
+/*
+ * Check for a conflicting operation on a directory that we just unlinked from.
+ * If someone managed to sneak a link or an unlink in on the file we just
+ * unlinked, we won't be able to trust nlink on an AFS file (but not YFS).
+ */
+static inline void afs_check_dir_conflict(struct afs_operation *op,
+ struct afs_vnode_param *dvp)
+{
+ if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version)
+ op->flags |= AFS_OPERATION_DIR_CONFLICT;
+}
+
static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
{
trace_afs_io_error(call->debug_id, -EIO, where);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9c79c91e8005..31b472f7c734 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns)
timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
+ atomic_set(&net->servers_outstanding, 1);
ret = -ENOMEM;
sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
@@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns)
error_open_socket:
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
error_cell_init:
@@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
struct afs_net *net = afs_net(net_ns);
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
afs_close_socket(net);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 52b19e9c1535..5334f1bd2bca 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -83,6 +83,7 @@ int afs_abort_to_error(u32 abort_code)
case UAENOLCK: return -ENOLCK;
case UAENOTEMPTY: return -ENOTEMPTY;
case UAELOOP: return -ELOOP;
+ case UAEOVERFLOW: return -EOVERFLOW;
case UAENOMEDIUM: return -ENOMEDIUM;
case UAEDQUOT: return -EDQUOT;
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 90d852704328..9cf3102f370c 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -399,7 +399,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
int afs_permission(struct inode *inode, int mask)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
- afs_access_t uninitialized_var(access);
+ afs_access_t access;
struct key *key;
int ret = 0;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 039e3488511c..e82e452e2612 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net)
_enter("");
if (del_timer_sync(&net->fs_timer))
- atomic_dec(&net->servers_outstanding);
+ afs_dec_servers_outstanding(net);
afs_queue_server_manager(net);
_debug("wait");
+ atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
_leave("");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 768497f82aee..a121c247d95a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -194,11 +194,11 @@ int afs_write_end(struct file *file, struct address_space *mapping,
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size) {
- spin_lock(&vnode->wb_lock);
+ write_seqlock(&vnode->cb_lock);
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size)
i_size_write(&vnode->vfs_inode, maybe_i_size);
- spin_unlock(&vnode->wb_lock);
+ write_sequnlock(&vnode->cb_lock);
}
if (!PageUptodate(page)) {
@@ -393,6 +393,7 @@ static void afs_store_data_success(struct afs_operation *op)
{
struct afs_vnode *vnode = op->file[0].vnode;
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
if (op->error == 0) {
afs_pages_written_back(vnode, op->store.first, op->store.last);
@@ -448,6 +449,7 @@ static int afs_store_data(struct address_space *mapping,
op->store.first_offset = offset;
op->store.last_to = to;
op->mtime = vnode->vfs_inode.i_mtime;
+ op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
try_next_key:
@@ -491,6 +493,7 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
unsigned long count, priv;
unsigned n, offset, to, f, t;
pgoff_t start, first, last;
+ loff_t i_size, end;
int loop, ret;
_enter(",%lx", primary_page->index);
@@ -591,7 +594,12 @@ no_more:
first = primary_page->index;
last = first + count - 1;
+ end = (loff_t)last * PAGE_SIZE + to;
+ i_size = i_size_read(&vnode->vfs_inode);
+
_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+ if (end > i_size)
+ to = i_size & ~PAGE_MASK;
ret = afs_store_data(mapping, first, last, offset, to);
switch (ret) {
@@ -844,6 +852,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
vmf->page->index, priv);
SetPagePrivate(vmf->page);
set_page_private(vmf->page, priv);
+ file_update_time(file);
sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 52d5af5fcd44..8c24fdc899e3 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -330,29 +330,6 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
}
/*
- * Deliver a reply that's a status, callback and volsync.
- */
-static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
-{
- struct afs_operation *op = call->op;
- const __be32 *bp;
- int ret;
-
- ret = afs_transfer_reply(call);
- if (ret < 0)
- return ret;
-
- /* unmarshall the reply once we've received all of it */
- bp = call->buffer;
- xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb);
- xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb);
- xdr_decode_YFSVolSync(&bp, &op->volsync);
-
- _leave(" = 0 [done]");
- return 0;
-}
-
-/*
* Deliver reply data to operations that just return a file status and a volume
* sync record.
*/
@@ -375,48 +352,6 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
}
/*
- * YFS.FetchStatus operation type
- */
-static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
- .name = "YFS.FetchStatus(vnode)",
- .op = yfs_FS_FetchStatus,
- .deliver = yfs_deliver_fs_status_cb_and_volsync,
- .destructor = afs_flat_call_destructor,
-};
-
-/*
- * Fetch the status information for a file.
- */
-void yfs_fs_fetch_file_status(struct afs_operation *op)
-{
- struct afs_vnode_param *vp = &op->file[0];
- struct afs_call *call;
- __be32 *bp;
-
- _enter(",%x,{%llx:%llu},,",
- key_serial(op->key), vp->fid.vid, vp->fid.vnode);
-
- call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode,
- sizeof(__be32) * 2 +
- sizeof(struct yfs_xdr_YFSFid),
- sizeof(struct yfs_xdr_YFSFetchStatus) +
- sizeof(struct yfs_xdr_YFSCallBack) +
- sizeof(struct yfs_xdr_YFSVolSync));
- if (!call)
- return afs_op_nomem(op);
-
- /* marshall the parameters */
- bp = call->request;
- bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
- bp = xdr_encode_u32(bp, 0); /* RPC flags */
- bp = xdr_encode_YFSFid(bp, &vp->fid);
- yfs_check_req(call, bp);
-
- trace_afs_make_fs_call(call, &vp->fid);
- afs_make_op_call(op, call, GFP_NOFS);
-}
-
-/*
* Deliver reply data to an YFS.FetchData64.
*/
static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
@@ -1605,12 +1540,36 @@ void yfs_fs_release_lock(struct afs_operation *op)
}
/*
+ * Deliver a reply to YFS.FetchStatus
+ */
+static int yfs_deliver_fs_fetch_status(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_YFSFetchStatus(&bp, call, &vp->scb);
+ xdr_decode_YFSCallBack(&bp, call, &vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
* YFS.FetchStatus operation type
*/
static const struct afs_call_type yfs_RXYFSFetchStatus = {
.name = "YFS.FetchStatus",
.op = yfs_FS_FetchStatus,
- .deliver = yfs_deliver_fs_status_cb_and_volsync,
+ .deliver = yfs_deliver_fs_fetch_status,
.destructor = afs_flat_call_destructor,
};
@@ -1619,7 +1578,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = {
*/
void yfs_fs_fetch_status(struct afs_operation *op)
{
- struct afs_vnode_param *vp = &op->file[0];
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
struct afs_call *call;
__be32 *bp;
diff --git a/fs/aio.c b/fs/aio.c
index 7ecddc2f38db..91e7cc4a9f17 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -67,7 +67,7 @@ struct aio_ring {
unsigned header_length; /* size of aio_ring */
- struct io_event io_events[0];
+ struct io_event io_events[];
}; /* 128 bytes + ring size */
/*
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index b04c528b19d3..74c886f7c51c 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = __kernel_write(file, data, bytes, &file->f_pos);
+ wr = kernel_write(file, data, bytes, &file->f_pos);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 64cdf4d8e424..2482032021ca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,6 +22,7 @@
#include <linux/cred.h>
#include <linux/exportfs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
#include "befs.h"
#include "btree.h"
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e589388..8ae833e00443 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev)
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
@@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev)
invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
@@ -106,16 +105,7 @@ EXPORT_SYMBOL(invalidate_bdev);
static void set_init_blocksize(struct block_device *bdev)
{
- unsigned bsize = bdev_logical_block_size(bdev);
- loff_t size = i_size_read(bdev->bd_inode);
-
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+ bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
}
int set_blocksize(struct block_device *bdev, int size)
@@ -129,9 +119,8 @@ int set_blocksize(struct block_device *bdev, int size)
return -EINVAL;
/* Don't change the size if it is same as current */
- if (bdev->bd_block_size != size) {
+ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
- bdev->bd_block_size = size;
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
@@ -704,12 +693,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -740,7 +729,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
@@ -753,7 +742,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -784,7 +773,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
@@ -800,9 +788,6 @@ static void bdev_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- spin_lock(&bdev_lock);
- list_del_init(&bdev->bd_list);
- spin_unlock(&bdev_lock);
/* Detach inode from wb early as bdi_put() may free bdi->wb */
inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -877,8 +862,6 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-static LIST_HEAD(all_bdevs);
-
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
@@ -896,7 +879,6 @@ struct block_device *bdget(dev_t dev)
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
- bdev->bd_block_size = i_blocksize(inode);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
@@ -904,9 +886,6 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- spin_lock(&bdev_lock);
- list_add(&bdev->bd_list, &all_bdevs);
- spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
@@ -927,13 +906,14 @@ EXPORT_SYMBOL(bdgrab);
long nr_blockdev_pages(void)
{
- struct block_device *bdev;
+ struct inode *inode;
long ret = 0;
- spin_lock(&bdev_lock);
- list_for_each_entry(bdev, &all_bdevs, bd_list) {
- ret += bdev->bd_inode->i_mapping->nrpages;
- }
- spin_unlock(&bdev_lock);
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ ret += inode->i_mapping->nrpages;
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
return ret;
}
@@ -1035,30 +1015,28 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
}
/**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
* @whole: the whole device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Prepare to claim @bdev. This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress. This function doesn't actually claim. On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. return, the caller
+ * has ownership of bd_claiming and bd_holder[s].
*
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
-static int bd_prepare_to_claim(struct block_device *bdev,
- struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
retry:
+ spin_lock(&bdev_lock);
/* if someone else claimed, fail */
- if (!bd_may_claim(bdev, whole, holder))
+ if (!bd_may_claim(bdev, whole, holder)) {
+ spin_unlock(&bdev_lock);
return -EBUSY;
+ }
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) {
@@ -1069,13 +1047,15 @@ retry:
spin_unlock(&bdev_lock);
schedule();
finish_wait(wq, &wait);
- spin_lock(&bdev_lock);
goto retry;
}
/* yay, all mine */
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
return 0;
}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
{
@@ -1098,78 +1078,6 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
return disk;
}
-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively. Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress. Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
-{
- struct gendisk *disk;
- struct block_device *whole;
- int partno, err;
-
- might_sleep();
-
- /*
- * @bdev might not have been initialized properly yet, look up
- * and grab the outer block device the hard way.
- */
- disk = bdev_get_gendisk(bdev, &partno);
- if (!disk)
- return ERR_PTR(-ENXIO);
-
- /*
- * Normally, @bdev should equal what's returned from bdget_disk()
- * if partno is 0; however, some drivers (floppy) use multiple
- * bdev's for the same physical device and @bdev may be one of the
- * aliases. Keep @bdev if partno is 0. This means claimer
- * tracking is broken for those devices but it has always been that
- * way.
- */
- if (partno)
- whole = bdget_disk(disk, 0);
- else
- whole = bdgrab(bdev);
-
- put_disk_and_module(disk);
- if (!whole)
- return ERR_PTR(-ENOMEM);
-
- /* prepare to claim, if successful, mark claiming in progress */
- spin_lock(&bdev_lock);
-
- err = bd_prepare_to_claim(bdev, whole, holder);
- if (err == 0) {
- whole->bd_claiming = holder;
- spin_unlock(&bdev_lock);
- return whole;
- } else {
- spin_unlock(&bdev_lock);
- bdput(whole);
- return ERR_PTR(err);
- }
-}
-EXPORT_SYMBOL(bd_start_claiming);
-
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
lockdep_assert_held(&bdev_lock);
@@ -1182,14 +1090,14 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
/**
* bd_finish_claiming - finish claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Finish exclusive open of a block device. Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish.
*/
-void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
- void *holder)
+static void bd_finish_claiming(struct block_device *bdev,
+ struct block_device *whole, void *holder)
{
spin_lock(&bdev_lock);
BUG_ON(!bd_may_claim(bdev, whole, holder));
@@ -1204,12 +1112,11 @@ void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
bd_clear_claiming(whole, holder);
spin_unlock(&bdev_lock);
}
-EXPORT_SYMBOL(bd_finish_claiming);
/**
* bd_abort_claiming - abort claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Abort claiming of a block device when the exclusive open failed. This can be
@@ -1369,26 +1276,6 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif
/**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
- if (__invalidate_device(bdev, kill_dirty)) {
- printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n",
- bdev->bd_disk ? bdev->bd_disk->disk_name : "");
- }
- bdev->bd_invalidated = 1;
-}
-
-/**
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
@@ -1412,8 +1299,9 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size)
- flush_disk(bdev, false);
+ if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ pr_warn("VFS: busy inodes on resized disk %s\n",
+ disk->disk_name);
}
bdev->bd_invalidated = 0;
}
@@ -1472,7 +1360,10 @@ int check_disk_change(struct block_device *bdev)
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
- flush_disk(bdev, true);
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ disk->disk_name);
+ bdev->bd_invalidated = 1;
if (bdops->revalidate_disk)
bdops->revalidate_disk(bdev->bd_disk);
return 1;
@@ -1548,13 +1439,15 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
* mutex_lock_nested(whole->bd_mutex, 1)
*/
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+ int for_part)
{
+ struct block_device *whole = NULL, *claiming = NULL;
struct gendisk *disk;
int ret;
int partno;
int perm = 0;
- bool first_open = false;
+ bool first_open = false, unblock_events = true, need_restart;
if (mode & FMODE_READ)
perm |= MAY_READ;
@@ -1565,25 +1458,41 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
if (!for_part) {
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0) {
- bdput(bdev);
+ if (ret != 0)
return ret;
- }
}
restart:
-
+ need_restart = false;
ret = -ENXIO;
disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
goto out;
+ if (partno) {
+ whole = bdget_disk(disk, 0);
+ if (!whole) {
+ ret = -ENOMEM;
+ goto out_put_disk;
+ }
+ }
+
+ if (!for_part && (mode & FMODE_EXCL)) {
+ WARN_ON_ONCE(!holder);
+ if (whole)
+ claiming = whole;
+ else
+ claiming = bdev;
+ ret = bd_prepare_to_claim(bdev, claiming, holder);
+ if (ret)
+ goto out_put_whole;
+ }
+
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
- bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
@@ -1596,20 +1505,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
- if (ret == -ERESTARTSYS) {
- /* Lost a race with 'disk' being
- * deleted, try again.
- * See md.c
- */
- disk_put_part(bdev->bd_part);
- bdev->bd_part = NULL;
- bdev->bd_disk = NULL;
- bdev->bd_queue = NULL;
- mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
- put_disk_and_module(disk);
- goto restart;
- }
+ /*
+ * If we lost a race with 'disk' being deleted,
+ * try again. See md.c
+ */
+ if (ret == -ERESTARTSYS)
+ need_restart = true;
}
if (!ret) {
@@ -1630,16 +1531,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_clear;
} else {
- struct block_device *whole;
- whole = bdget_disk(disk, 0);
- ret = -ENOMEM;
- if (!whole)
- goto out_clear;
BUG_ON(for_part);
- ret = __blkdev_get(whole, mode, 1);
+ ret = __blkdev_get(whole, mode, NULL, 1);
if (ret)
goto out_clear;
- bdev->bd_contains = whole;
+ bdev->bd_contains = bdgrab(whole);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1668,28 +1564,52 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
+ if (claiming)
+ bd_finish_claiming(bdev, claiming, holder);
+
+ /*
+ * Block event polling for write claims if requested. Any write holder
+ * makes the write_holder state stick until all are released. This is
+ * good enough and tracking individual writeable reference is too
+ * fragile given the way @mode is used in blkdev_get/put().
+ */
+ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+
/* only one opener holds refs to the module and disk */
if (!first_open)
put_disk_and_module(disk);
+ if (whole)
+ bdput(whole);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
- bdev->bd_queue = NULL;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
+ if (claiming)
+ bd_abort_claiming(bdev, claiming, holder);
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
+ out_put_whole:
+ if (whole)
+ bdput(whole);
+ out_put_disk:
put_disk_and_module(disk);
+ if (need_restart)
+ goto restart;
out:
- bdput(bdev);
-
return ret;
}
@@ -1714,47 +1634,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- struct block_device *whole = NULL;
int res;
- WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
-
- if ((mode & FMODE_EXCL) && holder) {
- whole = bd_start_claiming(bdev, holder);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return PTR_ERR(whole);
- }
- }
-
- res = __blkdev_get(bdev, mode, 0);
-
- if (whole) {
- struct gendisk *disk = whole->bd_disk;
-
- /* finish claiming */
- mutex_lock(&bdev->bd_mutex);
- if (!res)
- bd_finish_claiming(bdev, whole, holder);
- else
- bd_abort_claiming(bdev, whole, holder);
- /*
- * Block event polling for write claims if requested. Any
- * write holder makes the write_holder state stick until
- * all are released. This is good enough and tracking
- * individual writeable reference is too fragile given the
- * way @mode is used in blkdev_get/put().
- */
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- disk_block_events(disk);
- }
-
- mutex_unlock(&bdev->bd_mutex);
- bdput(whole);
- }
-
+ res =__blkdev_get(bdev, mode, holder, 0);
+ if (res)
+ bdput(bdev);
return res;
}
EXPORT_SYMBOL(blkdev_get);
@@ -1850,7 +1734,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
*/
filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d888e71e66b6..ea10f7bc99ab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1461,6 +1461,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 176e8a292fd1..613920c17ac1 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -65,11 +65,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
- /* Pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
@@ -118,12 +115,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
- atomic_inc(&cache->count);
+ refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
- if (atomic_dec_and_test(&cache->count)) {
+ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
@@ -940,7 +937,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -978,7 +975,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out_put_group;
+ goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -1001,13 +998,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out_put_group;
+ goto out;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out_put_group;
+ goto out;
btrfs_release_path(path);
}
@@ -1016,6 +1013,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1089,7 +1089,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->space_info->lock);
- mutex_lock(&fs_info->chunk_mutex);
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
spin_lock(&block_group->lock);
block_group->removed = 1;
/*
@@ -1121,19 +1139,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
-
- ret = remove_block_group_item(trans, path, block_group);
- if (ret < 0)
- goto out;
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1145,10 +1150,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_extent_map(em);
}
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
@@ -1522,21 +1526,70 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_item bg;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 flags;
+ int ret = 0;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (em->start != key->objectid || em->len != key->offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, em->start, em->len);
+ ret = -EUCLEAN;
+ goto out_free_em;
+ }
+
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_em:
+ free_extent_map(em);
+ return ret;
+}
+
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
+ int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
- struct btrfs_block_group_item bg;
- u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (1) {
slot = path->slots[0];
@@ -1553,49 +1606,10 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_stack_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
+ ret = read_bg_from_eb(fs_info, &found_key, path);
+ break;
}
+
path->slots[0]++;
}
out:
@@ -1647,19 +1661,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
return -EIO;
map = em->map_lookup;
- data_stripe_length = em->len;
+ data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- data_stripe_length = div_u64(data_stripe_length,
- map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- data_stripe_length = div_u64(data_stripe_length,
- nr_data_stripes(map));
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
io_stripe_size = map->stripe_len * nr_data_stripes(map);
- }
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
@@ -1738,25 +1745,12 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->start + cache->length)
- continue;
-
- if (logical[nr] + stripe_len <= cache->start)
- continue;
-
- start = logical[nr];
- if (start < cache->start) {
- start = cache->start;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->start + cache->length - start);
- }
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, start, len);
+ ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+ len);
if (ret) {
kfree(logical);
return ret;
@@ -1808,7 +1802,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
- atomic_set(&cache->count, 1);
+ refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
@@ -2197,54 +2191,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return 0;
}
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
- BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
-}
-
/*
* Mark one block group RO, can be called several times for the same block
* group.
@@ -2290,7 +2236,7 @@ again:
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
@@ -2317,7 +2263,7 @@ again:
ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2511,7 +2457,8 @@ again:
num_pages *= 16;
num_pages *= PAGE_SIZE;
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ num_pages);
if (ret)
goto out_put;
@@ -3382,7 +3329,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
+ ASSERT(refcount_read(&block_group->refs) == 1);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
@@ -3437,7 +3384,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- mutex_lock(&fs_info->chunk_mutex);
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->start,
@@ -3445,7 +3391,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- mutex_unlock(&fs_info->chunk_mutex);
/* once for us and once for the tree */
free_extent_map(em);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index b6ee70a039c7..adfd7583a17b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -114,8 +114,7 @@ struct btrfs_block_group {
/* For block groups in the same raid type */
struct list_head list;
- /* Usage count */
- atomic_t count;
+ refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e7d709505cb1..c47b6c6fea9f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -152,6 +152,17 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * The id/generation of the last transaction where this inode was
+ * either the source or the destination of a clone/dedupe operation.
+ * Used when logging an inode to know if there are shared extents that
+ * need special care when logging checksum items, to avoid duplicate
+ * checksum items in a log (which can lead to a corruption where we end
+ * up with missing checksum ranges after log replay).
+ * Protected by the vfs inode lock.
+ */
+ u64 last_reflink_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 32e11a23b47f..81a8c87a5afb 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -631,10 +631,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (NULL == selected_super) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!selected_super)
return -ENOMEM;
- }
list_for_each_entry(device, dev_head, dev_list) {
int i;
@@ -795,7 +793,6 @@ static int btrfsic_process_superblock_dev_mirror(
if (NULL == superblock_tmp) {
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
- pr_info("btrfsic: error, kmalloc failed!\n");
ret = -1;
goto out;
}
@@ -921,9 +918,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
struct btrfsic_stack_frame *sf;
sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (NULL == sf)
- pr_info("btrfsic: alloc memory failed!\n");
- else
+ if (sf)
sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
return sf;
}
@@ -1313,7 +1308,6 @@ static int btrfsic_create_link_to_next_block(
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(next_block_ctx);
*next_blockp = NULL;
return -1;
@@ -1470,7 +1464,6 @@ static int btrfsic_handle_extent_data(
mirror_num,
&block_was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&next_block_ctx);
return -1;
}
@@ -2013,7 +2006,6 @@ again:
block = btrfsic_block_alloc();
if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
goto continue_loop;
}
@@ -2234,7 +2226,6 @@ static int btrfsic_process_written_superblock(
mirror_num,
&was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&tmp_next_block_ctx);
return -1;
}
@@ -2542,10 +2533,8 @@ static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
&state->block_link_hashtable);
if (NULL == l) {
l = btrfsic_block_link_alloc();
- if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!l)
return NULL;
- }
l->block_ref_to = next_block;
l->block_ref_from = from_block;
@@ -2589,10 +2578,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
struct btrfsic_dev_state *dev_state;
block = btrfsic_block_alloc();
- if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!block)
return NULL;
- }
+
dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
if (NULL == dev_state) {
pr_info("btrfsic: error, lookup dev_state failed!\n");
@@ -2797,10 +2785,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
return -1;
}
state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- pr_info("btrfs check-integrity: allocation failed!\n");
+ if (!state)
return -ENOMEM;
- }
if (!btrfsic_is_initialized) {
mutex_init(&btrfsic_mutex);
@@ -2829,7 +2815,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
ds = btrfsic_dev_state_alloc();
if (NULL == ds) {
- pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
return -ENOMEM;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6e648603f85..1ab56a734e70 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -172,18 +172,17 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
-static int check_compressed_csum(struct btrfs_inode *inode,
- struct compressed_bio *cb,
+static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int ret;
struct page *page;
unsigned long i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
+ struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
@@ -201,15 +200,15 @@ static int check_compressed_csum(struct btrfs_inode *inode,
if (memcmp(&csum, cb_sum, csum_size)) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- ret = -EIO;
- goto fail;
+ if (btrfs_io_bio(bio)->device)
+ btrfs_dev_stat_inc_and_print(
+ btrfs_io_bio(bio)->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ return -EIO;
}
cb_sum += csum_size;
-
}
- ret = 0;
-fail:
- return ret;
+ return 0;
}
/* when we finish reading compressed pages from the disk, we
@@ -244,7 +243,6 @@ static void end_compressed_bio_read(struct bio *bio)
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- ASSERT(btrfs_io_bio(cb->orig_bio));
btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
@@ -256,7 +254,7 @@ static void end_compressed_bio_read(struct bio *bio)
goto csum_failed;
inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), cb,
+ ret = check_compressed_csum(BTRFS_I(inode), bio,
(u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -405,7 +403,7 @@ out:
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
@@ -413,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
unsigned long bytes_left;
@@ -421,7 +419,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page *page;
u64 first_byte = disk_start;
blk_status_t ret;
- int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -429,7 +427,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
- cb->inode = inode;
+ cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
@@ -455,7 +453,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
int submit = 0;
page = compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
+ page->mapping = inode->vfs_inode.i_mapping;
if (bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 284a3ad31350..9f3dbe372631 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -8,6 +8,8 @@
#include <linux/sizes.h>
+struct btrfs_inode;
+
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
@@ -88,7 +90,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio *bio);
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3a7648bff42c..70e49d8d4f6c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1196,7 +1196,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- /* Fallthrough */
+ fallthrough;
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1501,6 +1501,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1513,6 +1529,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 30ce7039bc27..9c7e466f27a9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -546,11 +546,6 @@ enum {
*/
BTRFS_FS_EXCL_OP,
/*
- * To info transaction_kthread we need an immediate commit so it
- * doesn't need to wait for commit_interval
- */
- BTRFS_FS_NEED_ASYNC_COMMIT,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -779,6 +774,7 @@ struct btrfs_fs_info {
u32 thread_pool_size;
struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
u64 total_pinned;
@@ -1009,6 +1005,10 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used only for subvolume roots. */
+ BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
};
/*
@@ -1057,8 +1057,10 @@ struct btrfs_root {
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
struct list_head log_ctxs[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_writers;
atomic_t log_commit[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
int log_transid;
/* No matter the commit succeeds or not*/
@@ -1073,7 +1075,6 @@ struct btrfs_root {
u64 highest_objectid;
- u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1160,6 +1161,7 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
@@ -1275,18 +1277,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
/*
* Requests for changes that need to be done during transaction commit.
@@ -1893,6 +1895,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
const struct btrfs_disk_key *disk)
{
@@ -1934,6 +1982,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
btrfs_disk_key_to_cpu(key, &disk_key);
}
+#endif
+
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -2230,7 +2280,8 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
}
/* struct btrfs_file_extent_item */
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
@@ -2239,6 +2290,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
@@ -2255,6 +2308,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -2506,16 +2560,46 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
+
/*
- * Flushing delalloc may cause deadlock somewhere, in this
- * case, use FLUSH LIMIT
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Commiting transaction
+ *
+ * Can be interruped by fatal signal.
+ */
BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interruped by fatal signal.
+ */
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
@@ -2829,8 +2913,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- u64 file_start, int contig);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -2873,7 +2957,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2926,7 +3010,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -2960,7 +3044,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -2976,10 +3060,13 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3192,7 +3279,7 @@ do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO) { \
+ if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
(errno)); \
@@ -3376,7 +3463,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 1245739a3a6e..0e354e9e57d0 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -237,10 +237,10 @@ commit_trans:
return 0;
}
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
/* align the range */
@@ -248,14 +248,14 @@ int btrfs_check_data_free_space(struct inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
else
ret = 0;
return ret;
@@ -269,16 +269,12 @@ int btrfs_check_data_free_space(struct inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
+ ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
@@ -293,17 +289,17 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->fs_info->sectorsize) -
- round_down(start, root->fs_info->sectorsize);
- start = round_down(start, root->fs_info->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
@@ -557,7 +553,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
@@ -565,7 +561,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
if (ret < 0)
btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
@@ -583,10 +579,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+ btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 54466fbd7075..28bf5c3ef430 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -6,18 +6,18 @@
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 5615320fa659..741c7e19c32f 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -619,6 +619,7 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
bg_list) {
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
}
spin_unlock(&fs_info->unused_bgs_lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7c6f0bbb54a5..9ae25f632157 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1116,6 +1116,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1141,10 +1142,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (!dummy)
- root->defrag_trans_start = fs_info->generation;
- else
- root->defrag_trans_start = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1395,7 +1392,12 @@ alloc_fail:
goto out;
}
-static int btrfs_init_fs_root(struct btrfs_root *root)
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
unsigned int nofs_flag;
@@ -1428,9 +1430,20 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ if (!anon_dev) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ } else {
+ root->anon_dev = anon_dev;
+ }
+ }
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
@@ -1534,8 +1547,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
}
-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, bool check_ref)
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid: root id
+ * @anon_dev: preallocated anonymous block device number for new roots,
+ * pass 0 for new allocation.
+ * @check_ref: whether to check root item references, If true, return -ENOENT
+ * for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev,
+ bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1564,6 +1596,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
+ /* Shouldn't get preallocated anon_dev for cached roots */
+ ASSERT(!anon_dev);
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1583,7 +1617,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root);
+ ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
@@ -1616,25 +1650,31 @@ fail:
return ERR_PTR(ret);
}
-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid: tree objectid
+ * @check_ref: if set, verify that the tree exists and the item has at least
+ * one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref)
{
- struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
- int ret = 0;
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
+ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}
- rcu_read_lock();
- list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
- bdi = device->bdev->bd_bdi;
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid: tree objectid
+ * @anon_dev: if zero, allocate a new anonymous block device or use the
+ * parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev)
+{
+ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
/*
@@ -1749,7 +1789,6 @@ static int transaction_kthread(void *arg)
now = ktime_get_seconds();
if (cur->state < TRANS_STATE_COMMIT_START &&
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
@@ -2001,8 +2040,7 @@ void btrfs_put_root(struct btrfs_root *root)
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
+ free_root_extent_buffers(root);
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
@@ -2593,10 +2631,12 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
!extent_buffer_uptodate(tree_root->node)) {
handle_error = true;
- if (IS_ERR(tree_root->node))
+ if (IS_ERR(tree_root->node)) {
ret = PTR_ERR(tree_root->node);
- else if (!extent_buffer_uptodate(tree_root->node))
+ tree_root->node = NULL;
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
ret = -EUCLEAN;
+ }
btrfs_warn(fs_info, "failed to read tree root");
continue;
@@ -3051,8 +3091,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->congested_fn = btrfs_congested_fn;
- sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
@@ -4056,6 +4094,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bf43245406c4..00dc39d47ed3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index b6561455b3c4..f39d47a2d01a 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -233,14 +233,11 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
struct extent_state **cached_state);
/* This should be reworked in the future and put elsewhere. */
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec);
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0bc35f932bf..61ede335f6c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5466,6 +5473,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
}
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68c96057ad2d..6def411b2eba 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1999,7 +1999,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -2017,15 +2018,14 @@ out:
return err;
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
{
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
- __process_pages_contig(inode->i_mapping, locked_page,
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
page_ops, NULL);
}
@@ -2122,12 +2122,11 @@ out:
return ret;
}
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
@@ -2136,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start,
*/
node = tree_search(tree, start);
if (!node) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
- *failrec = state->failrec;
+
+ failrec = state->failrec;
out:
spin_unlock(&tree->lock);
- return ret;
+ return failrec;
}
/*
@@ -2377,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret)
+ failrec = get_state_failrec(failure_tree, start);
+ if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
@@ -2450,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
spin_unlock(&failure_tree->lock);
}
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2462,65 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
int ret;
u64 logical;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret) {
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
-
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
- failrec->in_validation = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return -EIO;
- }
-
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
- kfree(failrec);
- return -EIO;
- }
-
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
- /* set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0)
- ret = set_state_failrec(failure_tree, start, failrec);
- /* set the bits in the inode's tree */
- if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
- if (ret < 0) {
- kfree(failrec);
- return ret;
- }
- } else {
+ failrec = get_state_failrec(failure_tree, start);
+ if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
failrec->logical, failrec->start, failrec->len,
@@ -2530,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
+
+ return failrec;
}
- *failrec_ret = failrec;
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
+
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
- return 0;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ }
+
+ btrfs_debug(fs_info,
+ "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
+ logical, start, failrec->len);
+
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* Set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY);
+ if (ret >= 0) {
+ ret = set_state_failrec(failure_tree, start, failrec);
+ /* Set the bits in the inode's tree */
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ } else if (ret < 0) {
+ kfree(failrec);
+ return ERR_PTR(ret);
+ }
+
+ return failrec;
}
static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
@@ -2659,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
- int ret;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return errno_to_blk_status(ret);
+ failrec = btrfs_get_io_failure_record(inode, start, end);
+ if (IS_ERR(failrec))
+ return errno_to_blk_status(PTR_ERR(failrec));
need_validation = btrfs_io_needs_validation(inode, failed_bio);
@@ -3419,7 +3416,7 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
@@ -3432,7 +3429,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, page,
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3449,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
* started, so we don't want to return > 0 unless
* things are going well.
*/
- ret = ret < 0 ? ret : -EIO;
- goto done;
+ return ret < 0 ? ret : -EIO;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3482,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
return 1;
}
- ret = 0;
-
-done:
- return ret;
+ return 0;
}
/*
@@ -3496,7 +3489,7 @@ done:
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
@@ -3504,7 +3497,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3536,7 +3529,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
update_nr_written(wbc, nr_written + 1);
end = page_end;
- blocksize = inode->i_sb->s_blocksize;
+ blocksize = inode->vfs_inode.i_sb->s_blocksize;
while (cur <= end) {
u64 em_end;
@@ -3547,8 +3540,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, 1);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
- end - cur + 1);
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@@ -3585,7 +3577,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
+ btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
@@ -3658,15 +3650,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
if (!epd->extent_locked) {
- ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
+ &nr_written);
if (ret == 1)
return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ nr_written, &nr);
if (ret == 1)
return 0;
@@ -4126,7 +4119,7 @@ retry:
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
- ret = -EUCLEAN;
+ ret = -EROFS;
end_write_bio(&epd, ret);
}
return ret;
@@ -4488,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
@@ -4501,20 +4497,52 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (!test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL)) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &btrfs_inode->runtime_flags);
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
- }
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+next:
start = extent_map_end(em);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
@@ -4669,7 +4697,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+ u64 start, u64 len)
{
int ret = 0;
u64 off = start;
@@ -5058,25 +5086,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5527,6 +5558,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 87f60a48f750..00a88f2eb5ab 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -204,7 +204,7 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
+ u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -277,7 +277,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 706a3128e192..7d5ec71615b8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -522,10 +522,10 @@ fail:
* means this bio can contains potentially discontigous bio vecs
* so the logical offset of each should be calculated separately.
*/
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2c14312b05e8..bb824c7cb7c7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -500,18 +500,18 @@ next:
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
@@ -524,13 +524,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (!btrfs_is_free_space_inode(inode)) {
if (start_pos >= isize &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
/*
* There can't be any extents following eof in this case
* so just set the delalloc new bit for the range
@@ -538,8 +538,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
*/
extra_bits |= EXTENT_DELALLOC_NEW;
} else {
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
- start_pos,
+ err = btrfs_find_new_delalloc_bytes(inode, start_pos,
num_bytes, cached);
if (err)
return err;
@@ -564,7 +563,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* at this time.
*/
if (end_pos > isize)
- i_size_write(inode, end_pos);
+ i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
@@ -731,7 +730,7 @@ next:
* is deleted from the tree.
*/
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -744,7 +743,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct inode *vfs_inode = &inode->vfs_inode;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -762,9 +762,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int leafs_visited = 0;
if (drop_cache)
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ if (start >= inode->disk_i_size && !replace_extent)
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
@@ -935,7 +935,7 @@ next_slot:
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, end - key.offset);
+ inode_sub_bytes(vfs_inode, end - key.offset);
break;
}
@@ -955,7 +955,7 @@ next_slot:
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, extent_end - start);
+ inode_sub_bytes(vfs_inode, extent_end - start);
if (end == extent_end)
break;
@@ -979,7 +979,7 @@ delete_extent_item:
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
@@ -993,7 +993,7 @@ delete_extent_item:
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
}
@@ -1082,8 +1082,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache, 0, 0, NULL);
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
+ end, NULL, drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
@@ -1532,8 +1532,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1541,32 +1541,87 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
+ num_bytes = lockend - lockstart + 1;
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
+ if (nowait) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ return -EAGAIN;
+
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ num_bytes);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
+ lockend, NULL);
+ }
- num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ if (!nowait)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-
+out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
+static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, true);
+}
+
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * >0 and update @write_bytes if we can do nocow write
+ * 0 if we can't do nocow write
+ * -EAGAIN if we can't get the needed lock or there are ordered extents
+ * for * (nowait == true) case
+ * <0 if other error happened
+ *
+ * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, false);
+}
+
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
@@ -1574,7 +1629,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1627,13 +1681,12 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
write_bytes);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1658,11 +1711,11 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
@@ -1732,7 +1785,7 @@ again:
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
@@ -1742,8 +1795,9 @@ again:
fs_info->sectorsize);
if (copied > 0)
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state);
/*
* If we have not locked the extent range, because the range's
@@ -1766,7 +1820,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1784,8 +1838,6 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
@@ -1795,11 +1847,12 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
@@ -1904,13 +1957,23 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
/*
* We will allocate space in case nodatacow is not set,
* so bail
*/
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
+ <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count) {
inode_unlock(inode);
return -EAGAIN;
}
@@ -2570,7 +2633,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
cur_offset = start;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
cur_offset, end + 1, &drop_end,
1, 0, 0, NULL);
if (ret != -ENOSPC) {
@@ -3148,14 +3211,14 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- alloc_start, bytes_to_reserve);
- if (ret)
- goto out;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret)
+ goto out;
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
@@ -3171,7 +3234,7 @@ reserve_space:
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
@@ -3322,8 +3385,9 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset,
+ last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
@@ -3335,8 +3399,9 @@ static long btrfs_fallocate(struct file *file, int mode,
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, last_byte - cur_offset);
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3353,7 +3418,7 @@ static long btrfs_fallocate(struct file *file, int mode,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
@@ -3374,7 +3439,7 @@ out:
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
@@ -3472,7 +3537,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return generic_file_open(inode, filp);
}
@@ -3481,6 +3546,7 @@ const struct file_operations btrfs_file_operations = {
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 55955bd424d7..6d961e11639e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1334,8 +1334,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
- i_size_read(inode), &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
+ io_ctl->num_pages, 0, i_size_read(inode),
+ &cached_state);
if (ret)
goto out_nospc;
@@ -2703,8 +2704,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
-static int
-__btrfs_return_cluster_to_free_space(
+static void __btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
@@ -2756,7 +2756,6 @@ __btrfs_return_cluster_to_free_space(
out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
- return 0;
}
static void __btrfs_remove_free_space_cache_locked(
@@ -2907,12 +2906,11 @@ out:
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
- int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
@@ -2920,28 +2918,27 @@ int btrfs_return_cluster_to_free_space(
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
- ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
/* finally drop our ref */
btrfs_put_block_group(block_group);
- return ret;
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
@@ -3358,7 +3355,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
list_del_init(&entry->list);
if (!ret) {
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 2e0a8077aa74..e3d5e0ad8f8e 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -136,7 +136,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 6009e0e939b5..76d2e43817ea 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,7 +495,8 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
+ prealloc);
if (ret)
goto out_put;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d04c82c88418..611b3412fbfd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -80,17 +80,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate);
@@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct inode *inode,
* to be released, which we want to happen only when finishing the ordered
* extent (btrfs_finish_ordered_io()).
*/
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 bytes)
{
@@ -116,7 +116,7 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
struct page *page;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
@@ -274,15 +274,15 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
u64 aligned_end = ALIGN(end, fs_info->sectorsize);
@@ -314,7 +314,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (compressed_size && compressed_pages)
extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -323,9 +323,9 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
- ret = __btrfs_drop_extents(trans, root, inode, path,
- start, aligned_end, NULL,
- 1, 1, extent_item_size, &extent_inserted);
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
+ NULL, 1, 1, extent_item_size,
+ &extent_inserted);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -334,7 +334,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, path, extent_inserted,
- root, inode, start,
+ root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
@@ -345,8 +345,8 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -412,10 +412,10 @@ static noinline int add_async_extent(struct async_chunk *cow,
/*
* Check if the inode has flags compatible with compression
*/
-static inline bool inode_can_compress(struct inode *inode)
+static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
return false;
return true;
}
@@ -424,29 +424,30 @@ static inline bool inode_can_compress(struct inode *inode)
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(BTRFS_I(inode)));
+ btrfs_ino(inode));
return 0;
}
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* defrag ioctl */
- if (BTRFS_I(inode)->defrag_compress)
+ if (inode->defrag_compress)
return 1;
/* bad compression ratios */
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
if (btrfs_test_opt(fs_info, COMPRESS) ||
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
- BTRFS_I(inode)->prop_compress)
- return btrfs_compress_heuristic(inode, start, end);
+ inode->flags & BTRFS_INODE_COMPRESS ||
+ inode->prop_compress)
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
return 0;
}
@@ -552,7 +553,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(inode, start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -616,11 +617,12 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ 0, BTRFS_COMPRESS_NONE,
+ NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(inode, start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
total_compressed,
compress_type, pages);
}
@@ -642,7 +644,8 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ NULL,
clear_flags,
PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
@@ -762,14 +765,14 @@ static void free_async_extent_pages(struct async_extent *async_extent)
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct extent_io_tree *io_tree = &inode->io_tree;
int ret = 0;
again:
@@ -802,7 +805,7 @@ retry:
* all those pages down to the drive.
*/
if (!page_started && !ret)
- extent_write_locked_range(inode,
+ extent_write_locked_range(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
@@ -832,7 +835,7 @@ retry:
* will not submit these pages down to lower
* layers.
*/
- extent_range_redirty_for_io(inode,
+ extent_range_redirty_for_io(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
@@ -867,8 +870,7 @@ retry:
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- async_extent->start,
+ btrfs_drop_extent_cache(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1, 0);
goto out_free_reserve;
@@ -884,8 +886,7 @@ retry:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
- if (btrfs_submit_compressed_write(inode,
- async_extent->start,
+ if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
@@ -896,12 +897,11 @@ retry:
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
- p->mapping = inode->i_mapping;
+ p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(p, start, end, 0);
p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end,
- NULL, 0,
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
PAGE_END_WRITEBACK |
PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
@@ -929,10 +929,10 @@ out_free:
goto again;
}
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 num_bytes)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
@@ -974,17 +974,18 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -993,7 +994,7 @@ static noinline int cow_file_range(struct inode *inode,
bool extent_reserved = false;
int ret = 0;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
@@ -1003,7 +1004,7 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = max(blocksize, num_bytes);
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
if (start == 0) {
/* lets try to make an inline extent */
@@ -1032,13 +1033,28 @@ static noinline int cow_file_range(struct inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + num_bytes - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1081,7 +1097,7 @@ static noinline int cow_file_range(struct inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
@@ -1097,8 +1113,7 @@ static noinline int cow_file_range(struct inode *inode,
page_ops = unlock ? PAGE_UNLOCK : 0;
page_ops |= PAGE_SET_PRIVATE2;
- extent_clear_unlock_delalloc(inode, start,
- start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1122,7 +1137,7 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1219,13 +1234,13 @@ static noinline void async_cow_free(struct btrfs_work *work)
kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode,
+static int cow_file_range_async(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
@@ -1237,9 +1252,9 @@ static int cow_file_range_async(struct inode *inode,
unsigned nofs_flag;
const unsigned int write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
num_chunks = 1;
should_compress = false;
@@ -1277,9 +1292,9 @@ static int cow_file_range_async(struct inode *inode,
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
- ihold(inode);
+ ihold(&inode->vfs_inode);
async_chunk[i].pending = &ctx->num_chunks;
- async_chunk[i].inode = inode;
+ async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1356,13 +1371,15 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
return 1;
}
-static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end,
int *page_started, unsigned long *nr_written)
{
- const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
+ const bool is_reloc_ino = (inode->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
const u64 range_bytes = end + 1 - start;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
u64 count;
@@ -1391,18 +1408,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
* data space info, which we incremented in the step above.
*
* If we need to fallback to cow and the inode corresponds to a free
- * space cache inode, we must also increment bytes_may_use of the data
- * space_info for the same reason. Space caches always get a prealloc
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
* extent for them, however scrub or balance may have set the block
- * group that contains that extent to RO mode.
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0);
- if (count > 0 || is_space_ino) {
- const u64 bytes = is_space_ino ? range_bytes : count;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
spin_lock(&sinfo->lock);
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
spin_unlock(&sinfo->lock);
@@ -1423,21 +1445,21 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static noinline int run_delalloc_nocow(struct inode *inode,
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
int *page_started, int force,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const bool freespace_inode = btrfs_is_free_space_inode(inode);
+ u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
@@ -1663,15 +1685,11 @@ out_check:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page, cow_start,
- found_key.offset - 1,
+ ret = fallback_to_cow(inode, locked_page,
+ cow_start, found_key.offset - 1,
page_started, nr_written);
- if (ret) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
+ if (ret)
goto error;
- }
cow_start = (u64)-1;
}
@@ -1687,9 +1705,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
ret = PTR_ERR(em);
goto error;
}
@@ -1699,8 +1714,7 @@ out_check:
num_bytes,
BTRFS_ORDERED_PREALLOC);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
0);
goto error;
@@ -1776,11 +1790,11 @@ error:
return ret;
}
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
{
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC))
return 0;
/*
@@ -1788,9 +1802,8 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* if is not zero, it means the file is defragging.
* Force cow if given extent needs to be defragged.
*/
- if (BTRFS_I(inode)->defrag_bytes &&
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_DEFRAG, 0, NULL))
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
return 1;
return 0;
@@ -1800,26 +1813,25 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1);
} else {
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
@@ -2068,9 +2080,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(
- &inode->vfs_inode,
- state->start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2146,7 +2156,7 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
struct inode *inode = private_data;
blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -2211,7 +2221,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
0, inode, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
if (ret)
goto out;
}
@@ -2248,13 +2258,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- extra_bits, cached_state);
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2271,7 +2281,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 page_start;
u64 page_end;
int ret = 0;
@@ -2279,7 +2289,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = fixup->inode;
+ inode = BTRFS_I(fixup->inode);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2316,8 +2326,7 @@ again:
* when the page was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
@@ -2333,20 +2342,18 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
goto out_reserved;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2366,11 +2373,11 @@ again:
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
out_page:
if (ret) {
@@ -2393,7 +2400,7 @@ out_page:
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(&inode->vfs_inode);
}
/*
@@ -2449,18 +2456,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 file_pos,
- u64 disk_bytenr, u64 disk_num_bytes,
- u64 num_bytes, u64 ram_bytes,
- u8 compression, u8 encryption,
- u16 other_encoding, int extent_type)
+ struct btrfs_inode *inode, u64 file_pos,
+ struct btrfs_file_extent_item *stack_fi,
+ u64 qgroup_reserved)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 qg_released;
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
int extent_inserted = 0;
int ret;
@@ -2479,60 +2486,42 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
*/
ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
file_pos + num_bytes, NULL, 0,
- 1, sizeof(*fi), &extent_inserted);
+ 1, sizeof(*stack_fi), &extent_inserted);
if (ret)
goto out;
if (!extent_inserted) {
- ins.objectid = btrfs_ino(BTRFS_I(inode));
+ ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*fi));
+ sizeof(*stack_fi));
if (ret)
goto out;
}
leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_set_file_extent_type(leaf, fi, extent_type);
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, 0);
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
- btrfs_set_file_extent_compression(leaf, fi, compression);
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
+ write_extent_buffer(leaf, stack_fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_file_extent_item));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(inode, num_bytes);
+ inode_add_bytes(&inode->vfs_inode, num_bytes);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
- ram_bytes);
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
goto out;
- /*
- * Release the reserved range from inode dirty range map, as it is
- * already moved into delayed_ref_head
- */
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
- if (ret < 0)
- goto out;
- qg_released = ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
- file_pos, qg_released, &ins);
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos, qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2554,7 +2543,33 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
}
-/* as ordered data IO finishes, this gets called so we can finish
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_file_extent_item stack_fi;
+ u64 logical_len;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
+ oe->disk_num_bytes);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
+ logical_len = oe->truncated_len;
+ else
+ logical_len = oe->num_bytes;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
+ &stack_fi, oe->qgroup_rsv);
+}
+
+/*
+ * As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
@@ -2605,13 +2620,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- /*
- * For mwrite(mmap + memset to write) case, we still reserve
- * space for NOCOW range.
- * As NOCOW won't cause a new delayed ref, just free the space
- */
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -2648,20 +2656,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_reserved_file_extent(trans, inode, start,
- ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes,
- logical_len, logical_len,
- compress_type, 0, 0,
- BTRFS_FILE_EXTENT_REG);
+ ret = insert_ordered_extent_file_extent(trans, inode,
+ ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2813,6 +2815,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
io_bio->mirror_num);
+ if (io_bio->device)
+ btrfs_dev_stat_inc_and_print(io_bio->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -3331,6 +3336,14 @@ cache_index:
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Same logic as for last_unlink_trans. We don't persist the generation
+ * of the last transaction where this inode was used for a reflink
+ * operation, so after eviction and reloading the inode we must be
+ * pessimistic and assume the last transaction that modified the inode.
+ */
+ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -3479,7 +3492,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
fill_inode_item(trans, leaf, inode_item, inode);
btrfs_mark_buffer_dirty(leaf);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
ret = 0;
failed:
btrfs_free_path(path);
@@ -3509,7 +3522,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
return ret;
}
@@ -4024,6 +4037,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
}
}
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
@@ -4494,11 +4509,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
char *kaddr;
+ bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
+ size_t write_bytes = blocksize;
int ret = 0;
u64 block_start;
u64 block_end;
@@ -4510,15 +4527,28 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- block_start, blocksize);
- if (ret)
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
+ block_start, blocksize);
+ if (ret < 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
+ &write_bytes) > 0) {
+ /* For nocow case, no need to reserve data space */
+ only_release_metadata = true;
+ } else {
+ goto out;
+ }
+ }
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, block_start, blocksize);
goto out;
-
+ }
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
block_start, blocksize, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
@@ -4543,7 +4573,7 @@ again:
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(inode, block_start);
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
@@ -4558,7 +4588,7 @@ again:
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
@@ -4583,14 +4613,26 @@ again:
set_page_dirty(page);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ if (only_release_metadata)
+ set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+ block_end, EXTENT_NORESERVE, NULL, NULL,
+ GFP_NOFS);
+
out_unlock:
- if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (ret) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ blocksize, true);
+ else
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ block_start, blocksize, true);
+ }
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
extent_changeset_free(data_reserved);
return ret;
}
@@ -4948,7 +4990,8 @@ static void evict_inode_truncate_pages(struct inode *inode)
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state_flags & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+ end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DELALLOC |
@@ -6023,7 +6066,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
btrfs_update_root_times(trans, root);
@@ -6832,7 +6875,7 @@ out:
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -6846,21 +6889,19 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
int ret;
if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start,
- block_start, block_len, orig_block_len,
- ram_bytes,
+ em = create_io_em(inode, start, len, orig_start, block_start,
+ block_len, orig_block_len, ram_bytes,
BTRFS_COMPRESS_NONE, /* compress_type */
type);
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
- len, block_len, type);
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
+ block_len, type);
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + len - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
}
em = ERR_PTR(ret);
}
@@ -6869,11 +6910,11 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
return em;
}
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
@@ -6890,15 +6931,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
return em;
}
/*
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
- * block must be cow'd
+ * Check if we can do nocow write into the range [@offset, @offset + @len)
+ *
+ * @offset: File offset
+ * @len: The length to write, will be updated to the nocow writeable
+ * range
+ * @orig_start: (optional) Return the original file offset of the file extent
+ * @orig_len: (optional) Return the original on-disk length of the file extent
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks for (nowait == false) case.
+ *
+ * Return:
+ * >0 and update @len if we can do nocow write
+ * 0 if we can't do nocow write
+ * <0 if error happened
+ *
+ * NOTE: This only checks the file extents, caller is responsible to wait for
+ * any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
@@ -7125,8 +7183,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type)
@@ -7140,7 +7198,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &BTRFS_I(inode)->extent_tree;
+ em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7162,8 +7220,8 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}
do {
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
- em->start + em->len - 1, 0);
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
@@ -7242,7 +7300,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
- em2 = btrfs_create_dio_extent(inode, start, len,
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
@@ -7261,8 +7319,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* use the existing or preallocated extent, so does not
* need to adjust btrfs_space_info's bytes_may_use.
*/
- btrfs_free_reserved_data_space_noquota(inode, start,
- len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
goto skip_cow;
}
}
@@ -7270,7 +7327,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
/* this will cow the extent */
len = bh_result->b_size;
free_extent_map(em);
- *map = em = btrfs_new_extent_direct(inode, start, len);
+ *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -7421,7 +7478,8 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ __endio_write_update_ordered(BTRFS_I(dip->inode),
+ dip->logical_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
@@ -7507,18 +7565,18 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
return err;
}
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -7526,9 +7584,9 @@ static void __endio_write_update_ordered(struct inode *inode,
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
+ &ordered_offset,
+ ordered_bytes,
+ uptodate)) {
btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
NULL);
btrfs_queue_work(wq, &ordered->work);
@@ -7555,7 +7613,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
{
struct inode *inode = private_data;
blk_status_t ret;
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -7616,7 +7674,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
if (ret)
goto err;
} else {
@@ -7865,11 +7923,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
offset, count);
if (ret)
goto out;
@@ -7901,8 +7956,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, offset, dio_data.reserve,
+ true);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -7911,13 +7967,13 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
+ __endio_write_update_ordered(BTRFS_I(inode),
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
false);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
offset, count - (size_t)ret, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
@@ -7932,7 +7988,7 @@ out:
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+ u64 start, u64 len)
{
int ret;
@@ -8109,20 +8165,17 @@ again:
/*
* Qgroup reserved space handler
* Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
+ * 1) Already written to disk or ordered extent already submitted
+ * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
+ * bit of its io_tree, and free the qgroup reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8186,8 +8239,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- reserved_space);
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
if (!ret2) {
ret2 = file_update_time(vmf->vma->vm_file);
reserved = 1;
@@ -8234,9 +8287,9 @@ again:
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space,
- true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
}
}
@@ -8251,7 +8304,7 @@ again:
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
unlock_extent_cached(io_tree, page_start, page_end,
@@ -8291,7 +8344,7 @@ out_unlock:
unlock_page(page);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
@@ -8505,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_reflink_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -8591,7 +8645,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(inode);
+ btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
@@ -9573,6 +9627,31 @@ out_unlock:
return err;
}
+static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_key *ins,
+ u64 file_offset)
+{
+ struct btrfs_file_extent_item stack_fi;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+ if (ret < 0)
+ return ret;
+ return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
+ &stack_fi, ret);
+}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9631,11 +9710,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_reserved_file_extent(trans, inode,
- cur_offset, ins.objectid,
- ins.offset, ins.offset,
- ins.offset, 0, 0, 0,
- BTRFS_FILE_EXTENT_PREALLOC);
+ ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
if (ret) {
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
@@ -9708,7 +9783,7 @@ next:
btrfs_end_transaction(trans);
}
if (clear_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
end - clear_offset + 1);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 168deb8ef68a..bd3511c5ca81 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
-/* Check if @flags are a supported and valid set of FS_*_FL flags */
-static int check_fsflags(unsigned int flags)
+/*
+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
+ * the old and new flags are not conflicting
+ */
+static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags)
FS_NOCOW_FL))
return -EOPNOTSUPP;
+ /* COMPR and NOCOMP on new/old are valid */
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
+ return -EINVAL;
+
+ /* NOCOW and compression options are mutually exclusive */
+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+
return 0;
}
@@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags = binode->flags;
+ u32 binode_flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
- ret = check_fsflags(fsflags);
- if (ret)
- return ret;
-
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
-
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+
ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
if (ret)
goto out_unlock;
+ ret = check_fsflags(old_fsflags, fsflags);
+ if (ret)
+ goto out_unlock;
+
+ binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
@@ -566,6 +580,7 @@ static noinline int create_subvol(struct inode *dir,
struct inode *inode;
int ret;
int err;
+ dev_t anon_dev = 0;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
@@ -578,6 +593,10 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail_free;
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto fail_free;
+
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
@@ -660,12 +679,15 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
key.offset = (u64)-1;
- new_root = btrfs_get_fs_root(fs_info, objectid, true);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
+ free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
+ /* Freeing will be done in btrfs_put_root() of new_root */
+ anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root);
@@ -735,6 +757,8 @@ fail:
return ret;
fail_free:
+ if (anon_dev)
+ free_anon_bdev(anon_dev);
kfree(root_item);
return ret;
}
@@ -762,6 +786,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot)
return -ENOMEM;
+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
+ if (ret < 0)
+ goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
@@ -823,10 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
+ pending_snapshot->anon_dev = 0;
fail:
+ /* Prevent double freeing of anon_dev */
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
@@ -1243,7 +1276,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
@@ -1265,7 +1298,7 @@ again:
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
page_start);
unlock_extent_cached(tree, page_start, page_end,
&cached_state);
@@ -1333,7 +1366,7 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1361,7 +1394,7 @@ out:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
@@ -2692,7 +2725,7 @@ out:
btrfs_put_root(root);
out_free:
btrfs_free_path(path);
- kzfree(subvol_info);
+ kfree(subvol_info);
return ret;
}
@@ -3198,11 +3231,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 flags_in;
int ret = 0;
- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
- if (!fi_args)
- return -ENOMEM;
+ fi_args = memdup_user(arg, sizeof(*fi_args));
+ if (IS_ERR(fi_args))
+ return PTR_ERR(fi_args);
+
+ flags_in = fi_args->flags;
+ memset(fi_args, 0, sizeof(*fi_args));
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
@@ -3218,6 +3255,23 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
+ fi_args->generation = fs_info->generation;
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
+ sizeof(fi_args->metadata_uuid));
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
+ }
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e13b3d28c063..ebac13389e7e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -15,6 +15,7 @@
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -152,23 +153,39 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/* allocate and add a new ordered_extent into the per-inode tree.
+/*
+ * Allocate and add a new ordered_extent into the per-inode tree.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type, int dio,
int compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
+ int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
+ if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ /* For nocow write, we can release the qgroup rsv right now */
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ } else {
+ /*
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ }
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
@@ -178,9 +195,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->num_bytes = num_bytes;
entry->disk_num_bytes = disk_num_bytes;
entry->bytes_left = num_bytes;
- entry->inode = igrab(inode);
+ entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+ entry->qgroup_rsv = ret;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -197,10 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- INIT_LIST_HEAD(&entry->log_list);
- INIT_LIST_HEAD(&entry->trans_list);
- trace_btrfs_ordered_extent_add(inode, entry);
+ trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -228,14 +244,14 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
return 0;
}
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
@@ -244,7 +260,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
@@ -253,7 +269,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type)
@@ -291,12 +307,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
* file_offset is updated to one byte past the range that is recorded as
* complete. This allows you to walk forward in the file.
*/
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size, int uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
@@ -305,7 +321,6 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
u64 dec_start;
u64 to_dec;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
@@ -429,8 +444,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
- ASSERT(list_empty(&entry->log_list));
- ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
@@ -698,14 +711,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,7 +816,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(inode, offset);
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
if (!ordered)
return 0;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c01c9698250b..d61ea9c880a3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -92,6 +92,9 @@ struct btrfs_ordered_extent {
/* compression algorithm */
int compress_type;
+ /* Qgroup reserved space */
+ int qgroup_rsv;
+
/* reference count */
refcount_t refs;
@@ -101,12 +104,6 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
- /* If we need to wait on this to be done */
- struct list_head log_list;
-
- /* If the transaction needs to wait on this ordered extent */
- struct list_head trans_list;
-
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -150,23 +147,23 @@ void btrfs_remove_ordered_extent(struct inode *inode,
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size,
int uptodate);
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5bd4089ad0e1..c0f350c3a0cf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
-#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
@@ -22,6 +21,7 @@
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
+#include "sysfs.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -220,10 +220,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
@@ -252,7 +254,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
return 0;
}
@@ -351,6 +353,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
@@ -412,6 +417,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
@@ -500,12 +509,51 @@ out:
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
+/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
* quota_root to be null with qgroup_lock held before, so it is safe to clean
@@ -519,7 +567,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -528,6 +576,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -900,6 +949,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -987,6 +1039,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
@@ -1011,6 +1068,11 @@ out_add_root:
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
ret = btrfs_commit_transaction(trans);
trans = NULL;
@@ -1046,6 +1108,7 @@ out:
fs_info->qgroup_ulist = NULL;
if (trans)
btrfs_end_transaction(trans);
+ btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1398,8 +1461,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
- if (IS_ERR(qgroup))
+ if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -2818,6 +2884,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
unlock:
spin_unlock(&fs_info->qgroup_lock);
+ if (!ret)
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2826,20 +2894,8 @@ out:
return ret;
}
-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO 32
-#define QGROUP_FREE_SIZE SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 free;
- u64 threshold;
-
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2848,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
- /*
- * Even if we passed the check, it's better to check if reservation
- * for meta_pertrans is pushing us near limit.
- * If there is too much pertrans reservation or it's near the limit,
- * let's try commit transaction to free some, using transaction_kthread
- */
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- } else {
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- }
-
- /*
- * Use transaction_kthread to commit transaction, so we no
- * longer need to bother nested transaction nor lock context.
- */
- if (free < threshold)
- btrfs_commit_transaction_locksafe(fs_info);
- }
-
return true;
}
@@ -2921,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -3378,28 +3408,132 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
}
}
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry = NULL;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else if (entry)
+ node = node->rb_left;
+ else
+ break;
+ }
+
+ /* Empty changeset */
+ if (!entry)
+ return 0;
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
+}
+
/*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
*
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
*
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
*
- * NOTE: this function may sleep for memory allocation.
- * if btrfs_qgroup_reserve_data() is called multiple times with
- * same @reserved, caller must ensure when error happens it's OK
- * to free *ALL* reserved space.
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
+ bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
@@ -3412,6 +3546,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
+ new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
@@ -3419,15 +3554,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
- trace_btrfs_qgroup_reserve_data(inode, start, len,
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
- goto cleanup;
+ goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -3435,23 +3570,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
- /* cleanup *ALL* already reserved ranges */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
- /* Also free data bytes of already reserved one */
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
- extent_changeset_release(reserved);
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_release(reserved);
+ kfree(reserved);
+ *reserved_ret = NULL;
+ }
return ret;
}
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
/* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
@@ -3487,8 +3648,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
- free_start, free_start + free_len - 1,
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -3502,7 +3663,7 @@ out:
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode,
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
@@ -3510,8 +3671,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
- &BTRFS_I(inode)->root->fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
@@ -3519,18 +3679,18 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
- trace_btrfs_qgroup_release_data(inode, start, len,
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3550,7 +3710,7 @@ out:
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode,
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
@@ -3571,7 +3731,7 @@ int btrfs_qgroup_free_data(struct inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
@@ -3616,7 +3776,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3643,6 +3803,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
+{
+ int ret;
+
+ ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3742,7 +3917,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
@@ -3750,19 +3925,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
- inode->i_ino, unode->val, unode->aux);
+ btrfs_warn(inode->root->fs_info,
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
+ btrfs_ino(inode), unode->val, unode->aux);
}
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1bc654459469..50dea9a2d8fb 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -223,8 +224,18 @@ struct btrfs_qgroup {
*/
u64 old_refcnt;
u64 new_refcnt;
+
+ /*
+ * Sysfs kobjectid
+ */
+ struct kobject kobj;
};
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
/*
* For qgroup event trace points only
*/
@@ -344,12 +355,12 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
-
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
@@ -399,7 +410,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
@@ -415,5 +426,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c870ef70f817..255490f42b5d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1083,7 +1083,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
- u64 last_end = 0;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
@@ -1098,15 +1097,14 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = (u64)last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
- if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_status &&
+ if (last_end == disk_start && !last->bi_status &&
last->bi_disk == stripe->dev->bdev->bd_disk &&
last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
@@ -1117,6 +1115,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1325,11 +1324,7 @@ write_data:
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -1354,7 +1349,6 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
- u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
@@ -1362,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
- stripe_start = stripe->physical;
- if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len &&
+ if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
@@ -1382,18 +1374,14 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_iter.bi_sector;
- u64 stripe_start;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
int i;
- logical <<= 9;
-
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->bbio->raid_map[i];
- if (logical >= stripe_start &&
- logical < stripe_start + rbio->stripe_len) {
+ u64 stripe_start = rbio->bbio->raid_map[i];
+
+ if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
- }
}
return -1;
}
@@ -1567,11 +1555,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -1878,11 +1862,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
+ if (faila > failb)
+ swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
@@ -2102,7 +2083,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
- goto out;
+ return 0;
} else {
goto cleanup;
}
@@ -2113,11 +2094,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -2126,7 +2103,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
submit_bio(bio);
}
-out:
+
return 0;
cleanup:
@@ -2482,11 +2459,7 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -2664,11 +2637,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 7887317033c9..7f03dbe5b609 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
+ } else {
+ kfree(re);
}
kfree(be);
return exist;
@@ -509,7 +511,7 @@ static int process_leaf(struct btrfs_root *root,
switch (key.type) {
case BTRFS_EXTENT_ITEM_KEY:
*num_bytes = key.offset;
- /* fall through */
+ fallthrough;
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 040009d1cc31..5cd02514cf4d 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -68,8 +68,8 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
- block_size);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ file_offset, block_size);
if (ret)
goto out;
@@ -84,7 +84,8 @@ static int copy_inline_to_page(struct inode *inode,
clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
+ 0, NULL);
if (ret)
goto out_unlock;
@@ -133,8 +134,8 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, file_offset,
- block_size, true);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ file_offset, block_size, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
out:
extent_changeset_free(data_reserved);
@@ -336,6 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
int type;
u32 size;
struct btrfs_key new_key;
@@ -384,6 +386,7 @@ process_slot:
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
@@ -488,6 +491,19 @@ process_slot:
btrfs_release_path(path);
+ /*
+ * If this is a new extent update the last_reflink_trans of both
+ * inodes. This is used by fsync to make sure it does not log
+ * multiple checksum items with overlapping ranges. For older
+ * extents we don't need to do it since inode logging skips the
+ * checksums for older extents. Also ignore holes and inline
+ * extents because they don't have checksums in the csum tree.
+ */
+ if (extent_gen == trans->transid && disko > 0) {
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+ }
+
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3bbae80c752f..4ba1ab9cc76d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ /*
+ * In merge_reloc_root(), we modify the upper level pointer to swap the
+ * tree blocks between reloc tree and subvolume tree. Thus for tree
+ * block COW, we COW at most from level 1 to root level for each tree.
+ *
+ * Thus the needed metadata size is at most root_level * nodesize,
+ * and * 2 since we have two trees to COW.
+ */
+ min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
err = ret;
goto out;
@@ -2571,58 +2579,50 @@ out_free_blocks:
return err;
}
-static noinline_for_stack
-int prealloc_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(
+ struct btrfs_inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = inode->index_cnt;
u64 num_bytes;
- int nr = 0;
+ int nr;
int ret = 0;
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
- u64 cur_offset;
- struct extent_changeset *data_reserved = NULL;
+ u64 cur_offset = prealloc_start;
BUG_ON(cluster->start != cluster->boundary[0]);
- inode_lock(inode);
-
- ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
- prealloc_end + 1 - prealloc_start);
+ ret = btrfs_alloc_data_chunk_ondemand(inode,
+ prealloc_end + 1 - prealloc_start);
if (ret)
- goto out;
+ return ret;
- cur_offset = prealloc_start;
- while (nr < cluster->nr) {
+ inode_lock(&inode->vfs_inode);
+ for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end);
num_bytes = end + 1 - start;
- if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, start - cur_offset);
- ret = btrfs_prealloc_file_range(inode, 0, start,
+ ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
if (ret)
break;
- nr++;
}
+ inode_unlock(&inode->vfs_inode);
+
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, prealloc_end + 1 - cur_offset);
-out:
- inode_unlock(inode);
- extent_changeset_free(data_reserved);
+ btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
+ prealloc_end + 1 - cur_offset);
return ret;
}
@@ -2664,7 +2664,8 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
*/
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
- return atomic_read(&fs_info->balance_cancel_req);
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
@@ -2690,7 +2691,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(inode, cluster);
+ ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
@@ -2762,8 +2763,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
nr++;
}
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
+ page_end, 0, NULL);
if (ret) {
unlock_page(page);
put_page(page);
@@ -3872,9 +3873,9 @@ out:
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -3885,7 +3886,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ disk_bytenr = file_pos + inode->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 016a025e36c7..5a6cb9db512e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1616,13 +1616,9 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
struct scrub_page *spage = sblock->pagev[page_num];
BUG_ON(spage->page == NULL);
- if (spage->io_error) {
- void *mapped_buffer = kmap_atomic(spage->page);
+ if (spage->io_error)
+ clear_page(page_address(spage->page));
- clear_page(mapped_buffer);
- flush_dcache_page(spage->page);
- kunmap_atomic(mapped_buffer);
- }
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
@@ -1790,42 +1786,21 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- u8 *on_disk_csum;
- struct page *page;
- void *buffer;
- u64 len;
- int index;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0]->have_csum)
+ spage = sblock->pagev[0];
+ if (!spage->have_csum)
return 0;
+ kaddr = page_address(spage->page);
+
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- on_disk_csum = sblock->pagev[0]->csum;
- page = sblock->pagev[0]->page;
- buffer = kmap_atomic(page);
-
- len = sctx->fs_info->sectorsize;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, PAGE_SIZE);
-
- crypto_shash_update(shash, buffer, l);
- kunmap_atomic(buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- buffer = kmap_atomic(page);
- }
-
- crypto_shash_final(shash, csum);
- if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(csum, spage->csum, sctx->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
@@ -1839,20 +1814,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+ int i;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- h = (struct btrfs_header *)mapped_buffer;
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->csum_size);
/*
@@ -1860,40 +1830,29 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ if (spage->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ if (spage->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(h->fsid, spage))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(sblock->pagev[i]->page);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
crypto_shash_final(shash, calculated_csum);
@@ -1910,57 +1869,31 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
+ struct scrub_page *spage;
+ char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- s = (struct btrfs_super_block *)mapped_buffer;
- memcpy(on_disk_csum, s->csum, sctx->csum_size);
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ s = (struct btrfs_super_block *)kaddr;
- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+ if (spage->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+ if (spage->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(s->fsid, spage))
++fail_cor;
- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
-
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
- }
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -1973,10 +1906,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -3758,7 +3691,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- return -EIO;
+ return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 41ee88633769..475968ccbd1d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -468,8 +468,8 @@ again:
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
- btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
+ btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
@@ -879,8 +879,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
}
global_rsv->reserved -= ticket->bytes;
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
- list_del_init(&ticket->list);
wake_up(&ticket->wait);
space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bc73fd670702..5a9dc31d95c9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -67,6 +67,21 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -326,7 +341,6 @@ enum {
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_discard_mode,
- Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
@@ -340,13 +354,15 @@ enum {
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
- Opt_usebackuproot,
Opt_user_subvol_rm_allowed,
+ /* Rescue options */
+ Opt_rescue,
+ Opt_usebackuproot,
+ Opt_nologreplay,
+
/* Deprecated options */
- Opt_alloc_start,
Opt_recovery,
- Opt_subvolrootid,
/* Debugging options */
Opt_check_integrity,
@@ -390,7 +406,6 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
- {Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
@@ -408,13 +423,17 @@ static const match_table_t tokens = {
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
- {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ /* Rescue options */
+ {Opt_rescue, "rescue=%s"},
+ /* Deprecated, with alias rescue=nologreplay */
+ {Opt_nologreplay, "nologreplay"},
+ /* Deprecated, with alias rescue=usebackuproot */
+ {Opt_usebackuproot, "usebackuproot"},
+
/* Deprecated options */
- {Opt_alloc_start, "alloc_start=%s"},
{Opt_recovery, "recovery"},
- {Opt_subvolrootid, "subvolrootid=%d"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
@@ -433,6 +452,55 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+static const match_table_t rescue_tokens = {
+ {Opt_usebackuproot, "usebackuproot"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_err, NULL},
+};
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+ char *opts;
+ char *orig;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int ret = 0;
+
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ":")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, rescue_tokens, args);
+ switch (token){
+ case Opt_usebackuproot:
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_nologreplay:
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_err:
+ btrfs_info(info, "unrecognized rescue option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+
+ }
+out:
+ kfree(orig);
+ return ret;
+}
+
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -479,7 +547,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
- case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
@@ -523,7 +590,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
- /* Fallthrough */
+ fallthrough;
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
@@ -622,7 +689,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
- /* Fallthrough */
+ fallthrough;
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
@@ -663,10 +730,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
}
break;
- case Opt_alloc_start:
- btrfs_info(info,
- "option alloc_start is obsolete, ignored");
- break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
@@ -689,6 +752,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_norecovery:
case Opt_nologreplay:
+ btrfs_warn(info,
+ "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
@@ -762,6 +827,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_inode_cache:
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and will have no effect from 5.11");
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
@@ -791,10 +858,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_warn(info,
- "'recovery' is deprecated, use 'usebackuproot' instead");
- /* fall through */
case Opt_usebackuproot:
+ btrfs_warn(info,
+ "'%s' is deprecated, use 'rescue=usebackuproot' instead",
+ token == Opt_recovery ? "recovery" :
+ "usebackuproot");
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
@@ -859,6 +927,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
info->commit_interval = intarg;
break;
+ case Opt_rescue:
+ ret = parse_rescue_options(info, args[0].from);
+ if (ret < 0)
+ goto out;
+ break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -1020,9 +1093,6 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
*subvol_objectid = subvolid;
break;
- case Opt_subvolrootid:
- pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
- break;
default:
break;
}
@@ -1344,7 +1414,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",nologreplay");
+ seq_puts(seq, ",rescue=nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1712,11 +1782,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
new_pool_size);
}
-static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-}
-
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
unsigned long old_opts, int flags)
{
@@ -1750,8 +1815,6 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
-
- clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1767,7 +1830,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
int ret;
sync_filesystem(sb);
- btrfs_remount_prepare(fs_info);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
if (data) {
void *new_sec_opts = NULL;
@@ -1889,6 +1952,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
out:
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return 0;
restore:
@@ -1903,6 +1968,8 @@ restore:
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return ret;
}
@@ -2296,9 +2363,7 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_fs_devices *cur_devices;
struct btrfs_device *dev, *first_dev = NULL;
- struct list_head *head;
/*
* Lightweight locking of the devices. We should not need
@@ -2308,18 +2373,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* least until the rcu_read_unlock.
*/
rcu_read_lock();
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
- head = &cur_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
- cur_devices = cur_devices->seed;
+ list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
}
if (first_dev)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a39bff64ff24..104c80caaa74 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -19,6 +19,7 @@
#include "volumes.h"
#include "space-info.h"
#include "block-group.h"
+#include "qgroup.h"
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
@@ -936,8 +937,12 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+
btrfs_reset_fs_info_ptr(fs_info);
+ sysfs_remove_link(fsid_kobj, "bdi");
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
@@ -957,8 +962,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
}
#endif
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(fsid_kobj, btrfs_attrs);
btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
}
@@ -1273,7 +1278,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
{
int error = 0;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ nofs_flag = memalloc_nofs_save();
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (one_device && one_device != dev)
@@ -1301,6 +1308,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
break;
}
}
+ memalloc_nofs_restore(nofs_flag);
return error;
}
@@ -1438,6 +1446,10 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
+ error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (error)
+ goto failure;
+
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
@@ -1455,6 +1467,153 @@ failure:
return error;
}
+static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
+{
+ return to_fs_info(kobj->parent->parent);
+}
+
+#define QGROUP_ATTR(_member, _show_name) \
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type) \
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->rsv.values[_type], \
+ &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer, referenced);
+QGROUP_ATTR(excl, exclusive);
+QGROUP_ATTR(max_rfer, max_referenced);
+QGROUP_ATTR(max_excl, max_exclusive);
+QGROUP_ATTR(lim_flags, limit_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+static struct attribute *qgroup_attrs[] = {
+ BTRFS_ATTR_PTR(qgroup, referenced),
+ BTRFS_ATTR_PTR(qgroup, exclusive),
+ BTRFS_ATTR_PTR(qgroup, max_referenced),
+ BTRFS_ATTR_PTR(qgroup, max_exclusive),
+ BTRFS_ATTR_PTR(qgroup, limit_flags),
+ BTRFS_ATTR_PTR(qgroup, rsv_data),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+
+static void qgroup_release(struct kobject *kobj)
+{
+ struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj);
+
+ memset(&qgroup->kobj, 0, sizeof(*kobj));
+}
+
+static struct kobj_type qgroup_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = qgroup_release,
+ .default_groups = qgroup_groups,
+};
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+ if (qgroup->kobj.state_initialized)
+ return 0;
+ if (!qgroups_kobj)
+ return -EINVAL;
+
+ ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+ "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid));
+ if (ret < 0)
+ kobject_put(&qgroup->kobj);
+
+ return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node)
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+}
+
+/* Called when qgroups get initialized, thus there is no need for locking */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+ int ret = 0;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+
+ ASSERT(fsid_kobj);
+ if (fs_info->qgroups_kobj)
+ return 0;
+
+ fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
+ if (!fs_info->qgroups_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node) {
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ if (ret < 0)
+ btrfs_sysfs_del_qgroups(fs_info);
+ return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ if (qgroup->kobj.state_initialized) {
+ kobject_del(&qgroup->kobj);
+ kobject_put(&qgroup->kobj);
+ }
+}
/*
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 718a26c97833..cf839c46a131 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -36,4 +36,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+
#endif
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 914eea5ba6a7..2c783d2f5228 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
- if (i >= num_extents)
- goto invalid;
if (i >= num_extents ||
extent_start != extents[i].start ||
offset - extent_start != extents[i].length)
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 24a8c714f56c..894a63a92236 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -954,8 +954,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
+ BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL);
if (ret) {
@@ -999,7 +999,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
0, NULL);
@@ -1017,7 +1017,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL);
@@ -1035,7 +1035,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1069,7 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b359d4b17658..20c6ac1a5de7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
- err = -EIO;
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -1630,7 +1633,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_fs_root(fs_info, objectid, true);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
@@ -2351,7 +2354,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index bf102e64bfb2..d60b055b8695 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -151,18 +151,20 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
+ /* Preallocated anonymous block device number */
+ dev_t anon_dev;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_trans = trans->transaction->transid;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_trans = trans->transaction->transid;
+ inode->last_sub_trans = inode->root->log_transid;
+ inode->last_log_commit = inode->root->last_log_commit;
+ spin_unlock(&inode->lock);
}
/*
@@ -208,20 +210,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
-
-/*
- * Try to commit transaction asynchronously, so this is safe to call
- * even holding a spinlock.
- *
- * It's done by informing transaction_kthread to commit transaction without
- * waiting for commit interval.
- */
-static inline void btrfs_commit_transaction_locksafe(
- struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
- wake_up_process(fs_info->transaction_kthread);
-}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 16c3a6d2586d..d3f28b8f4ff9 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -133,10 +133,9 @@ out:
ret = 0;
}
done:
- if (ret != -EAGAIN) {
+ if (ret != -EAGAIN)
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
- root->defrag_trans_start = trans->transid;
- }
+
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 920cee312f4e..ea8136dcf71f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
@@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root)
{
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -3112,29 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
@@ -3180,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3303,6 +3293,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -3901,6 +3892,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
}
static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
@@ -3909,6 +3901,14 @@ static int log_csums(struct btrfs_trans_handle *trans,
int ret;
/*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
* Serialize logging for checksums. This is to avoid racing with the
* same checksum being logged by another task that is logging another
* file which happens to refer to the same extent as well. Such races
@@ -4059,7 +4059,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log, sums);
+ ret = log_csums(trans, inode, log, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4118,7 +4118,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log_root, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4146,7 +4146,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
em->start + em->len, NULL, 0, 1,
sizeof(*fi), &extent_inserted);
if (ret)
@@ -5118,14 +5118,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
int err = 0;
- int ret;
+ int ret = 0;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5161,15 +5160,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.offset = (u64)-1;
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * Only run delayed items if we are a directory. We want to make sure
+ * all directory indexes hit the fs/subvolume tree so we can find them
+ * and figure out which index ranges have to be logged.
+ *
+ * Otherwise commit the delayed inode only if the full sync flag is set,
+ * as we want to make sure an up to date version is in the subvolume
+ * tree so copy_inode_items_to_log() / copy_items() can find it and copy
+ * it to the log tree. For a non full sync, we always log the inode item
+ * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (S_ISDIR(inode->vfs_inode.i_mode))
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
+ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
ret = btrfs_commit_inode_delayed_inode(inode);
if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..d7670e2a9f39 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*
* global::fs_devs - add, remove, updates to the global list
*
- * does not protect: manipulation of the fs_devices::devices list!
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by eg.
+ * scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
@@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(const char *path,
return ret;
}
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
@@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int ret;
lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here in case opening the
+ * underlying device takes further locks like bd_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex
+ */
- mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
@@ -3231,7 +3246,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -4135,7 +4150,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
mutex_lock(&fs_info->balance_mutex);
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
btrfs_info(fs_info, "balance: paused");
- else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ /*
+ * Balance can be canceled by:
+ *
+ * - Regular cancel request
+ * Then ret == -ECANCELED and balance_cancel_req > 0
+ *
+ * - Fatal signal to "btrfs" process
+ * Either the signal caught by wait_reserve_ticket() and callers
+ * got -EINTR, or caught by btrfs_should_cancel_balance() and
+ * got -ECANCELED.
+ * Either way, in this case balance_cancel_req = 0, and
+ * ret == -EINTR or ret == -ECANCELED.
+ *
+ * So here we only check the return value to catch canceled balance.
+ */
+ else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
@@ -5522,6 +5552,9 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
atomic_set(&bbio->error, 0);
refcount_set(&bbio->refs, 1);
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+
return bbio;
}
@@ -6144,8 +6177,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripe_index++;
+ }
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
@@ -6153,11 +6191,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 tmp;
unsigned rot;
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
- sizeof(struct btrfs_bio_stripe) *
- num_alloc_stripes +
- sizeof(int) * tgtdev_indexes);
-
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
@@ -6171,25 +6204,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
+ sort_parity_stripes(bbio, num_stripes);
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
- if (bbio->raid_map)
- sort_parity_stripes(bbio, num_stripes);
-
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
@@ -6261,23 +6282,18 @@ static void btrfs_end_bio(struct bio *bio)
atomic_inc(&bbio->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- unsigned int stripe_index =
- btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
-
- BUG_ON(stripe_index >= bbio->num_stripes);
- dev = bbio->stripes[stripe_index].dev;
- if (dev->bdev) {
- if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+
+ ASSERT(dev->bdev);
+ if (bio_op(bio) == REQ_OP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ else if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- }
}
}
@@ -6313,13 +6329,12 @@ static void btrfs_end_bio(struct bio *bio)
}
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr)
+ u64 physical, struct btrfs_device *dev)
{
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
- btrfs_io_bio(bio)->stripe_index = dev_nr;
+ btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
@@ -6420,8 +6435,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr);
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
@@ -7029,6 +7043,19 @@ out:
return ret;
}
+static void readahead_tree_node_children(struct extent_buffer *node)
+{
+ int i;
+ const int nr_items = btrfs_header_nritems(node);
+
+ for (i = 0; i < nr_items; i++) {
+ u64 start;
+
+ start = btrfs_node_blockptr(node, i);
+ readahead_tree_block(node->fs_info, start);
+ }
+}
+
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -7039,6 +7066,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
int ret;
int slot;
u64 total_dev = 0;
+ u64 last_ra_node = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7049,7 +7077,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
- mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
/*
* Read all device items, and then all the chunk items. All
@@ -7064,6 +7099,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret < 0)
goto error;
while (1) {
+ struct extent_buffer *node;
+
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
@@ -7074,6 +7111,17 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
goto error;
break;
}
+ /*
+ * The nodes on level 1 are not locked but we don't need to do
+ * that during mount time as nothing else can access the tree
+ */
+ node = path->nodes[1];
+ if (node) {
+ if (last_ra_node != node->start) {
+ readahead_tree_node_children(node);
+ last_ra_node = node->start;
+ }
+ }
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
@@ -7086,7 +7134,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
+ mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
@@ -7116,7 +7166,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
}
ret = 0;
error:
- mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f067b5934c46..5eea93916fbf 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -288,7 +288,7 @@ struct btrfs_fs_devices {
*/
struct btrfs_io_bio {
unsigned int mirror_num;
- unsigned int stripe_index;
+ struct btrfs_device *device;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
@@ -408,7 +408,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
- /* fall through */
+ fallthrough;
case REQ_OP_READ:
return BTRFS_MAP_READ;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 64fe82ec65ff..061dd202979d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -320,9 +320,8 @@ static void decrypt_bh(struct work_struct *work)
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
/* Decrypt if needed */
- if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
- IS_ENCRYPTED(bh->b_page->mapping->host) &&
- S_ISREG(bh->b_page->mapping->host->i_mode)) {
+ if (uptodate &&
+ fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx) {
@@ -3040,12 +3039,10 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- /*
- * from here on down, it's all bio -- do the initial mapping,
- * submit_bio -> generic_make_request may further map this bio around
- */
bio = bio_alloc(GFP_NOIO, 1);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e7726f5f1241..3080cda9e824 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -937,7 +937,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
}
data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
+ ret = kernel_write(file, data, len, &pos);
kunmap(page);
fput(file);
if (ret != len)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index fc98b97b396a..53588d7517b4 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -399,6 +399,10 @@ skip_rdma:
if (ses->sign)
seq_puts(m, " signed");
+ seq_printf(m, "\n\tUser: %d Cred User: %d",
+ from_kuid(&init_user_ns, ses->linux_uid),
+ from_kuid(&init_user_ns, ses->cred_uid));
+
if (ses->chan_count > 1) {
seq_printf(m, "\n\n\tExtra Channels: %zu\n",
ses->chan_count-1);
@@ -406,7 +410,7 @@ skip_rdma:
cifs_dump_channel(m, j, &ses->chans[j]);
}
- seq_puts(m, "\n\tShares:");
+ seq_puts(m, "\n\n\tShares:");
j = 0;
seq_printf(m, "\n\t%d) IPC: ", j);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c7a311d28d3d..99b3180c613a 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.27"
+#define CIFS_VERSION "2.28"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5fac34f192af..a61abde09ffe 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -5306,9 +5306,15 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->nohandlecache = master_tcon->nohandlecache;
vol_info->local_lease = master_tcon->local_lease;
+ vol_info->no_lease = master_tcon->no_lease;
+ vol_info->resilient = master_tcon->use_resilient;
+ vol_info->persistent = master_tcon->use_persistent;
+ vol_info->handle_timeout = master_tcon->handle_timeout;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->linux_ext = master_tcon->posix_extensions;
vol_info->sectype = master_tcon->ses->sectype;
vol_info->sign = master_tcon->ses->sign;
+ vol_info->seal = master_tcon->seal;
rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
if (rc) {
@@ -5334,10 +5340,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
-
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4fe757cfc360..be46fab4c96d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1149,20 +1149,20 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
/*
* Set the byte-range lock (posix style). Returns:
- * 1) 0, if we set the lock and don't need to request to the server;
- * 2) 1, if we need to request to the server;
- * 3) <0, if the error occurs while setting the lock.
+ * 1) <0, if the error occurs while setting the lock;
+ * 2) 0, if we set the lock and don't need to request to the server;
+ * 3) FILE_LOCK_DEFERRED, if we will wait for some other file_lock;
+ * 4) FILE_LOCK_DEFERRED + 1, if we need to request to the server.
*/
static int
cifs_posix_lock_set(struct file *file, struct file_lock *flock)
{
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
- int rc = 1;
+ int rc = FILE_LOCK_DEFERRED + 1;
if ((flock->fl_flags & FL_POSIX) == 0)
return rc;
-try_again:
cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
@@ -1171,13 +1171,6 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
- if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait,
- list_empty(&flock->fl_blocked_member));
- if (!rc)
- goto try_again;
- locks_delete_block(flock);
- }
return rc;
}
@@ -1652,7 +1645,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
int posix_lock_type;
rc = cifs_posix_lock_set(file, flock);
- if (!rc || rc < 0)
+ if (rc <= FILE_LOCK_DEFERRED)
return rc;
if (type & server->vals->shared_lock_type)
@@ -4336,7 +4329,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
__SetPageLocked(page);
- if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
+ rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
+ if (rc) {
__ClearPageLocked(page);
break;
}
@@ -4352,6 +4346,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
int rc;
+ int err = 0;
struct list_head tmplist;
struct cifsFileInfo *open_file = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
@@ -4396,7 +4391,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* the order of declining indexes. When we put the pages in
* the rdata->pages, then we want them in increasing order.
*/
- while (!list_empty(page_list)) {
+ while (!list_empty(page_list) && !err) {
unsigned int i, nr_pages, bytes, rsize;
loff_t offset;
struct page *page, *tpage;
@@ -4429,9 +4424,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
return 0;
}
- rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
+ nr_pages = 0;
+ err = readpages_get_pages(mapping, page_list, rsize, &tmplist,
&nr_pages, &offset, &bytes);
- if (rc) {
+ if (!nr_pages) {
add_credits_and_wake_if(server, credits, 0);
break;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 583f5e4008c2..ce95801e9b66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2535,6 +2535,15 @@ set_size_out:
if (rc == 0) {
cifsInode->server_eof = attrs->ia_size;
cifs_setsize(inode, attrs->ia_size);
+
+ /*
+ * The man page of truncate says if the size changed,
+ * then the st_ctime and st_mtime fields for the file
+ * are updated.
+ */
+ attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
+ attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+
cifs_truncate_page(inode->i_mapping, inode->i_size);
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 4a73e63c4d43..dcde44ff6cf9 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -169,6 +169,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
unsigned int xid;
struct cifsFileInfo *pSMBFile = filep->private_data;
struct cifs_tcon *tcon;
+ struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb;
__u64 ExtAttrBits = 0;
__u64 caps;
@@ -307,13 +308,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
break;
}
cifs_sb = CIFS_SB(inode->i_sb);
- tcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
if (tcon && tcon->ses->server->ops->notify) {
rc = tcon->ses->server->ops->notify(xid,
filep, (void __user *)arg);
cifs_dbg(FYI, "ioctl notify rc %d\n", rc);
} else
rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 56791a692c8b..e44d049142d0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -844,28 +844,26 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
struct bio_vec *bv = NULL;
if (iov_iter_is_kvec(iter)) {
- memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+ memcpy(&ctx->iter, iter, sizeof(*iter));
ctx->len = count;
iov_iter_advance(iter, count);
return 0;
}
- if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
- bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
+ if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT)
+ bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL);
if (!bv) {
- bv = vmalloc(array_size(max_pages, sizeof(struct bio_vec)));
+ bv = vmalloc(array_size(max_pages, sizeof(*bv)));
if (!bv)
return -ENOMEM;
}
- if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
- pages = kmalloc_array(max_pages, sizeof(struct page *),
- GFP_KERNEL);
+ if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT)
+ pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL);
if (!pages) {
- pages = vmalloc(array_size(max_pages, sizeof(struct page *)));
+ pages = vmalloc(array_size(max_pages, sizeof(*pages)));
if (!pages) {
kvfree(bv);
return -ENOMEM;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a39451973f8..157992864ce7 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -354,9 +354,13 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
((struct smb2_ioctl_rsp *)shdr)->OutputCount);
break;
case SMB2_CHANGE_NOTIFY:
+ *off = le16_to_cpu(
+ ((struct smb2_change_notify_rsp *)shdr)->OutputBufferOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_change_notify_rsp *)shdr)->OutputBufferLength);
+ break;
default:
- /* BB FIXME for unimplemented cases above */
- cifs_dbg(VFS, "no length check for command\n");
+ cifs_dbg(VFS, "no length check for command %d\n", le16_to_cpu(shdr->Command));
break;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 736d86b8a910..32f90dc82c84 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -763,6 +763,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
/* close extra handle outside of crit sec */
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
}
+ rc = 0;
goto oshr_free;
}
@@ -2147,7 +2148,7 @@ smb3_notify(const unsigned int xid, struct file *pfile,
tcon = cifs_sb_master_tcon(cifs_sb);
oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
oparms.disposition = FILE_OPEN;
oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
@@ -3187,6 +3188,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
ses->Suid, offset, len);
+ /*
+ * We zero the range through ioctl, so we need remove the page caches
+ * first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
@@ -3253,6 +3259,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+ /*
+ * We implement the punch hole through ioctl, so we need remove the page
+ * caches first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d11e31064679..84433d0653f9 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -523,7 +523,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
const int timeout, const int flags,
unsigned int *instance)
{
- int rc;
+ long rc;
int *credits;
int optype;
long int t;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 8046d7c7a3e9..a5f5c30368a2 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -4,6 +4,7 @@ config FS_ENCRYPTION
select CRYPTO
select CRYPTO_HASH
select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_SHA256
select KEYS
help
Enable encryption of files and directories. This
@@ -21,6 +22,11 @@ config FS_ENCRYPTION_ALGS
select CRYPTO_CTS
select CRYPTO_ECB
select CRYPTO_HMAC
- select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_XTS
+
+config FS_ENCRYPTION_INLINE_CRYPT
+ bool "Enable fscrypt to use inline crypto"
+ depends on FS_ENCRYPTION && BLK_INLINE_ENCRYPTION
+ help
+ Enable fscrypt to use inline encryption hardware if available.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 232e2bb5a337..652c7180ec6d 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -11,3 +11,4 @@ fscrypto-y := crypto.o \
policy.o
fscrypto-$(CONFIG_BLOCK) += bio.o
+fscrypto-$(CONFIG_FS_ENCRYPTION_INLINE_CRYPT) += inline_crypt.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 4fa18fff9c4e..b048a0e38516 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -41,6 +41,53 @@ void fscrypt_decrypt_bio(struct bio *bio)
}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
+static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
+ pgoff_t lblk, sector_t pblk,
+ unsigned int len)
+{
+ const unsigned int blockbits = inode->i_blkbits;
+ const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits);
+ struct bio *bio;
+ int ret, err = 0;
+ int num_pages = 0;
+
+ /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
+ bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+
+ while (len) {
+ unsigned int blocks_this_page = min(len, blocks_per_page);
+ unsigned int bytes_this_page = blocks_this_page << blockbits;
+
+ if (num_pages == 0) {
+ fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
+ bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio->bi_iter.bi_sector =
+ pblk << (blockbits - SECTOR_SHIFT);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ }
+ ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
+ if (WARN_ON(ret != bytes_this_page)) {
+ err = -EIO;
+ goto out;
+ }
+ num_pages++;
+ len -= blocks_this_page;
+ lblk += blocks_this_page;
+ pblk += blocks_this_page;
+ if (num_pages == BIO_MAX_PAGES || !len ||
+ !fscrypt_mergeable_bio(bio, inode, lblk)) {
+ err = submit_bio_wait(bio);
+ if (err)
+ goto out;
+ bio_reset(bio);
+ num_pages = 0;
+ }
+ }
+out:
+ bio_put(bio);
+ return err;
+}
+
/**
* fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
* @inode: the file's inode
@@ -75,6 +122,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
if (len == 0)
return 0;
+ if (fscrypt_inode_uses_inline_crypto(inode))
+ return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk,
+ len);
+
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES);
nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
(len + blocks_per_page - 1) >> blocks_per_page_bits);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ed015cb66c7c..9212325763b0 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -84,7 +84,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
WARN_ON_ONCE(lblk_num > U32_MAX);
lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
- memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
}
iv->lblk_num = cpu_to_le64(lblk_num);
}
@@ -100,7 +100,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
if (WARN_ON_ONCE(len <= 0))
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 83ca5f1e7934..011830f84d8d 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,30 +61,13 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static struct crypto_shash *sha256_hash_tfm;
-
-static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
+static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
{
- struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm);
-
- if (unlikely(!tfm)) {
- struct crypto_shash *prev_tfm;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- fscrypt_err(NULL,
- "Error allocating SHA-256 transform: %ld",
- PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
- prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm);
- if (prev_tfm) {
- crypto_free_shash(tfm);
- tfm = prev_tfm;
- }
- }
+ struct sha256_state sctx;
- return crypto_shash_tfm_digest(tfm, data, data_len, result);
+ sha256_init(&sctx);
+ sha256_update(&sctx, data, data_len);
+ sha256_final(&sctx, result);
}
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
@@ -115,7 +98,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
const struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
int res;
@@ -171,7 +154,7 @@ static int fname_decrypt(const struct inode *inode,
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
const struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -349,7 +332,6 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
const struct qstr qname = FSTR_TO_QSTR(iname);
struct fscrypt_nokey_name nokey_name;
u32 size; /* size of the unencoded no-key name */
- int err;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -387,11 +369,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
- if (err)
- return err;
+ fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -530,9 +510,8 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256))
- return false;
+ fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), sha256);
return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index eb7fcd2b7fb8..8117a61b6f55 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -14,12 +14,13 @@
#include <linux/fscrypt.h>
#include <linux/siphash.h>
#include <crypto/hash.h>
+#include <linux/blk-crypto.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
-#define FS_KEY_DERIVATION_NONCE_SIZE 16
+#define FSCRYPT_FILE_NONCE_SIZE 16
-#define FSCRYPT_MIN_KEY_SIZE 16
+#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
@@ -30,7 +31,7 @@ struct fscrypt_context_v1 {
u8 filenames_encryption_mode;
u8 flags;
u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
struct fscrypt_context_v2 {
@@ -40,7 +41,7 @@ struct fscrypt_context_v2 {
u8 flags;
u8 __reserved[4];
u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
/*
@@ -166,6 +167,20 @@ struct fscrypt_symlink_data {
char encrypted_path[1];
} __packed;
+/**
+ * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption
+ * @tfm: crypto API transform object
+ * @blk_key: key for blk-crypto
+ *
+ * Normally only one of the fields will be non-NULL.
+ */
+struct fscrypt_prepared_key {
+ struct crypto_skcipher *tfm;
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ struct fscrypt_blk_crypto_key *blk_key;
+#endif
+};
+
/*
* fscrypt_info - the "encryption key" for an inode
*
@@ -175,12 +190,20 @@ struct fscrypt_symlink_data {
*/
struct fscrypt_info {
- /* The actual crypto transform used for encryption and decryption */
- struct crypto_skcipher *ci_ctfm;
+ /* The key in a form prepared for actual encryption/decryption */
+ struct fscrypt_prepared_key ci_enc_key;
- /* True if the key should be freed when this fscrypt_info is freed */
+ /* True if ci_enc_key should be freed when this fscrypt_info is freed */
bool ci_owns_key;
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ /*
+ * True if this inode will use inline encryption (blk-crypto) instead of
+ * the traditional filesystem-layer encryption.
+ */
+ bool ci_inlinecrypt;
+#endif
+
/*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
@@ -205,7 +228,7 @@ struct fscrypt_info {
/*
* If non-NULL, then encryption is done using the master key directly
- * and ci_ctfm will equal ci_direct_key->dk_ctfm.
+ * and ci_enc_key will equal ci_direct_key->dk_key.
*/
struct fscrypt_direct_key *ci_direct_key;
@@ -221,7 +244,7 @@ struct fscrypt_info {
union fscrypt_policy ci_policy;
/* This inode's nonce, copied from the fscrypt_context */
- u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
/* Hashed inode number. Only set for IV_INO_LBLK_32 */
u32 ci_hashed_ino;
@@ -257,9 +280,10 @@ union fscrypt_iv {
__le64 lblk_num;
/* per-file nonce; only set in DIRECT_KEY mode */
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
u8 raw[FSCRYPT_MAX_IV_SIZE];
+ __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
@@ -288,13 +312,13 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
* outputs are unique and cryptographically isolated, i.e. knowledge of one
* output doesn't reveal another.
*/
-#define HKDF_CONTEXT_KEY_IDENTIFIER 1
-#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2
-#define HKDF_CONTEXT_DIRECT_KEY 3
-#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4
-#define HKDF_CONTEXT_DIRHASH_KEY 5
-#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6
-#define HKDF_CONTEXT_INODE_HASH_KEY 7
+#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */
+#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */
+#define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */
+#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */
+#define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */
+#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */
+#define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */
int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
@@ -302,6 +326,78 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+/* inline_crypt.c */
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+
+static inline bool
+fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+{
+ return ci->ci_inlinecrypt;
+}
+
+int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci);
+
+void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key);
+
+/*
+ * Check whether the crypto transform or blk-crypto key has been allocated in
+ * @prep_key, depending on which encryption implementation the file will use.
+ */
+static inline bool
+fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
+ const struct fscrypt_info *ci)
+{
+ /*
+ * The two smp_load_acquire()'s here pair with the smp_store_release()'s
+ * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key().
+ * I.e., in some cases (namely, if this prep_key is a per-mode
+ * encryption key) another task can publish blk_key or tfm concurrently,
+ * executing a RELEASE barrier. We need to use smp_load_acquire() here
+ * to safely ACQUIRE the memory the other task published.
+ */
+ if (fscrypt_using_inline_encryption(ci))
+ return smp_load_acquire(&prep_key->blk_key) != NULL;
+ return smp_load_acquire(&prep_key->tfm) != NULL;
+}
+
+#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
+
+static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+{
+ return 0;
+}
+
+static inline bool
+fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+{
+ return false;
+}
+
+static inline int
+fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci)
+{
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+
+static inline void
+fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+{
+}
+
+static inline bool
+fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
+ const struct fscrypt_info *ci)
+{
+ return smp_load_acquire(&prep_key->tfm) != NULL;
+}
+#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
+
/* keyring.c */
/*
@@ -395,9 +491,9 @@ struct fscrypt_master_key {
* Per-mode encryption keys for the various types of encryption policies
* that use them. Allocated and derived on-demand.
*/
- struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
- struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
- struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
/* Hash key for inode numbers. Initialized only when needed. */
siphash_key_t mk_ino_hash_key;
@@ -461,13 +557,15 @@ struct fscrypt_mode {
int keysize;
int ivsize;
int logged_impl_name;
+ enum blk_crypto_mode_num blk_crypto_mode;
};
extern struct fscrypt_mode fscrypt_modes[];
-struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
- const u8 *raw_key,
- const struct inode *inode);
+int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key, const struct fscrypt_info *ci);
+
+void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key);
int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
new file mode 100644
index 000000000000..b6b8574caa13
--- /dev/null
+++ b/fs/crypto/inline_crypt.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inline encryption support for fscrypt
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * With "inline encryption", the block layer handles the decryption/encryption
+ * as part of the bio, instead of the filesystem doing the crypto itself via
+ * crypto API. See Documentation/block/inline-encryption.rst. fscrypt still
+ * provides the key and IV to use.
+ */
+
+#include <linux/blk-crypto.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/sched/mm.h>
+
+#include "fscrypt_private.h"
+
+struct fscrypt_blk_crypto_key {
+ struct blk_crypto_key base;
+ int num_devs;
+ struct request_queue *devs[];
+};
+
+static int fscrypt_get_num_devices(struct super_block *sb)
+{
+ if (sb->s_cop->get_num_devices)
+ return sb->s_cop->get_num_devices(sb);
+ return 1;
+}
+
+static void fscrypt_get_devices(struct super_block *sb, int num_devs,
+ struct request_queue **devs)
+{
+ if (num_devs == 1)
+ devs[0] = bdev_get_queue(sb->s_bdev);
+ else
+ sb->s_cop->get_devices(sb, devs);
+}
+
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+{
+ struct super_block *sb = ci->ci_inode->i_sb;
+ unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
+ int ino_bits = 64, lblk_bits = 64;
+
+ if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+ return offsetofend(union fscrypt_iv, nonce);
+
+ if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+ return sizeof(__le64);
+
+ if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
+ return sizeof(__le32);
+
+ /* Default case: IVs are just the file logical block number */
+ if (sb->s_cop->get_ino_and_lblk_bits)
+ sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
+ return DIV_ROUND_UP(lblk_bits, 8);
+}
+
+/* Enable inline encryption for this file if supported. */
+int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+{
+ const struct inode *inode = ci->ci_inode;
+ struct super_block *sb = inode->i_sb;
+ struct blk_crypto_config crypto_cfg;
+ int num_devs;
+ struct request_queue **devs;
+ int i;
+
+ /* The file must need contents encryption, not filenames encryption */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return 0;
+
+ /* The crypto mode must have a blk-crypto counterpart */
+ if (ci->ci_mode->blk_crypto_mode == BLK_ENCRYPTION_MODE_INVALID)
+ return 0;
+
+ /* The filesystem must be mounted with -o inlinecrypt */
+ if (!(sb->s_flags & SB_INLINECRYPT))
+ return 0;
+
+ /*
+ * When a page contains multiple logically contiguous filesystem blocks,
+ * some filesystem code only calls fscrypt_mergeable_bio() for the first
+ * block in the page. This is fine for most of fscrypt's IV generation
+ * strategies, where contiguous blocks imply contiguous IVs. But it
+ * doesn't work with IV_INO_LBLK_32. For now, simply exclude
+ * IV_INO_LBLK_32 with blocksize != PAGE_SIZE from inline encryption.
+ */
+ if ((fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
+ sb->s_blocksize != PAGE_SIZE)
+ return 0;
+
+ /*
+ * On all the filesystem's devices, blk-crypto must support the crypto
+ * configuration that the file would use.
+ */
+ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
+ crypto_cfg.data_unit_size = sb->s_blocksize;
+ crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
+ num_devs = fscrypt_get_num_devices(sb);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ if (!devs)
+ return -ENOMEM;
+ fscrypt_get_devices(sb, num_devs, devs);
+
+ for (i = 0; i < num_devs; i++) {
+ if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
+ goto out_free_devs;
+ }
+
+ ci->ci_inlinecrypt = true;
+out_free_devs:
+ kfree(devs);
+
+ return 0;
+}
+
+int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci)
+{
+ const struct inode *inode = ci->ci_inode;
+ struct super_block *sb = inode->i_sb;
+ enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
+ int num_devs = fscrypt_get_num_devices(sb);
+ int queue_refs = 0;
+ struct fscrypt_blk_crypto_key *blk_key;
+ int err;
+ int i;
+ unsigned int flags;
+
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ if (!blk_key)
+ return -ENOMEM;
+
+ blk_key->num_devs = num_devs;
+ fscrypt_get_devices(sb, num_devs, blk_key->devs);
+
+ err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode,
+ fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+ if (err) {
+ fscrypt_err(inode, "error %d initializing blk-crypto key", err);
+ goto fail;
+ }
+
+ /*
+ * We have to start using blk-crypto on all the filesystem's devices.
+ * We also have to save all the request_queue's for later so that the
+ * key can be evicted from them. This is needed because some keys
+ * aren't destroyed until after the filesystem was already unmounted
+ * (namely, the per-mode keys in struct fscrypt_master_key).
+ */
+ for (i = 0; i < num_devs; i++) {
+ if (!blk_get_queue(blk_key->devs[i])) {
+ fscrypt_err(inode, "couldn't get request_queue");
+ err = -EAGAIN;
+ goto fail;
+ }
+ queue_refs++;
+
+ flags = memalloc_nofs_save();
+ err = blk_crypto_start_using_key(&blk_key->base,
+ blk_key->devs[i]);
+ memalloc_nofs_restore(flags);
+ if (err) {
+ fscrypt_err(inode,
+ "error %d starting to use blk-crypto", err);
+ goto fail;
+ }
+ }
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
+ * I.e., here we publish ->blk_key with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it. Note that this concurrency is only
+ * possible for per-mode keys, not for per-file keys.
+ */
+ smp_store_release(&prep_key->blk_key, blk_key);
+ return 0;
+
+fail:
+ for (i = 0; i < queue_refs; i++)
+ blk_put_queue(blk_key->devs[i]);
+ kzfree(blk_key);
+ return err;
+}
+
+void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+{
+ struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key;
+ int i;
+
+ if (blk_key) {
+ for (i = 0; i < blk_key->num_devs; i++) {
+ blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
+ blk_put_queue(blk_key->devs[i]);
+ }
+ kzfree(blk_key);
+ }
+}
+
+bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
+{
+ return inode->i_crypt_info->ci_inlinecrypt;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
+
+static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+ u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
+{
+ union fscrypt_iv iv;
+ int i;
+
+ fscrypt_generate_iv(&iv, lblk_num, ci);
+
+ BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
+ memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
+ for (i = 0; i < ci->ci_mode->ivsize/sizeof(dun[0]); i++)
+ dun[i] = le64_to_cpu(iv.dun[i]);
+}
+
+/**
+ * fscrypt_set_bio_crypt_ctx() - prepare a file contents bio for inline crypto
+ * @bio: a bio which will eventually be submitted to the file
+ * @inode: the file's inode
+ * @first_lblk: the first file logical block number in the I/O
+ * @gfp_mask: memory allocation flags - these must be a waiting mask so that
+ * bio_crypt_set_ctx can't fail.
+ *
+ * If the contents of the file should be encrypted (or decrypted) with inline
+ * encryption, then assign the appropriate encryption context to the bio.
+ *
+ * Normally the bio should be newly allocated (i.e. no pages added yet), as
+ * otherwise fscrypt_mergeable_bio() won't work as intended.
+ *
+ * The encryption context will be freed automatically when the bio is freed.
+ */
+void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
+ u64 first_lblk, gfp_t gfp_mask)
+{
+ const struct fscrypt_info *ci;
+ u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return;
+ ci = inode->i_crypt_info;
+
+ fscrypt_generate_dun(ci, first_lblk, dun);
+ bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
+
+/* Extract the inode and logical block number from a buffer_head. */
+static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
+ const struct inode **inode_ret,
+ u64 *lblk_num_ret)
+{
+ struct page *page = bh->b_page;
+ const struct address_space *mapping;
+ const struct inode *inode;
+
+ /*
+ * The ext4 journal (jbd2) can submit a buffer_head it directly created
+ * for a non-pagecache page. fscrypt doesn't care about these.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ return false;
+ inode = mapping->host;
+
+ *inode_ret = inode;
+ *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) +
+ (bh_offset(bh) >> inode->i_blkbits);
+ return true;
+}
+
+/**
+ * fscrypt_set_bio_crypt_ctx_bh() - prepare a file contents bio for inline
+ * crypto
+ * @bio: a bio which will eventually be submitted to the file
+ * @first_bh: the first buffer_head for which I/O will be submitted
+ * @gfp_mask: memory allocation flags
+ *
+ * Same as fscrypt_set_bio_crypt_ctx(), except this takes a buffer_head instead
+ * of an inode and block number directly.
+ */
+void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
+ const struct buffer_head *first_bh,
+ gfp_t gfp_mask)
+{
+ const struct inode *inode;
+ u64 first_lblk;
+
+ if (bh_get_inode_and_lblk_num(first_bh, &inode, &first_lblk))
+ fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
+
+/**
+ * fscrypt_mergeable_bio() - test whether data can be added to a bio
+ * @bio: the bio being built up
+ * @inode: the inode for the next part of the I/O
+ * @next_lblk: the next file logical block number in the I/O
+ *
+ * When building a bio which may contain data which should undergo inline
+ * encryption (or decryption) via fscrypt, filesystems should call this function
+ * to ensure that the resulting bio contains only contiguous data unit numbers.
+ * This will return false if the next part of the I/O cannot be merged with the
+ * bio because either the encryption key would be different or the encryption
+ * data unit numbers would be discontiguous.
+ *
+ * fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
+ *
+ * Return: true iff the I/O is mergeable
+ */
+bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
+ u64 next_lblk)
+{
+ const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+
+ if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+ if (!bc)
+ return true;
+
+ /*
+ * Comparing the key pointers is good enough, as all I/O for each key
+ * uses the same pointer. I.e., there's currently no need to support
+ * merging requests where the keys are the same but the pointers differ.
+ */
+ if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base)
+ return false;
+
+ fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
+
+/**
+ * fscrypt_mergeable_bio_bh() - test whether data can be added to a bio
+ * @bio: the bio being built up
+ * @next_bh: the next buffer_head for which I/O will be submitted
+ *
+ * Same as fscrypt_mergeable_bio(), except this takes a buffer_head instead of
+ * an inode and block number directly.
+ *
+ * Return: true iff the I/O is mergeable
+ */
+bool fscrypt_mergeable_bio_bh(struct bio *bio,
+ const struct buffer_head *next_bh)
+{
+ const struct inode *inode;
+ u64 next_lblk;
+
+ if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk))
+ return !bio->bi_crypt_context;
+
+ return fscrypt_mergeable_bio(bio, inode, next_lblk);
+}
+EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e24eb48bfbe1..71d56f8e2870 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -45,9 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk)
wipe_master_key_secret(&mk->mk_secret);
for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) {
- crypto_free_skcipher(mk->mk_direct_keys[i]);
- crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]);
- crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
}
key_put(mk->mk_users);
@@ -213,7 +213,11 @@ static int allocate_filesystem_keyring(struct super_block *sb)
if (IS_ERR(keyring))
return PTR_ERR(keyring);
- /* Pairs with READ_ONCE() in fscrypt_find_master_key() */
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_find_master_key().
+ * I.e., here we publish ->s_master_keys with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it.
+ */
smp_store_release(&sb->s_master_keys, keyring);
return 0;
}
@@ -234,8 +238,13 @@ struct key *fscrypt_find_master_key(struct super_block *sb,
struct key *keyring;
char description[FSCRYPT_MK_DESCRIPTION_SIZE];
- /* pairs with smp_store_release() in allocate_filesystem_keyring() */
- keyring = READ_ONCE(sb->s_master_keys);
+ /*
+ * Pairs with the smp_store_release() in allocate_filesystem_keyring().
+ * I.e., another task can publish ->s_master_keys concurrently,
+ * executing a RELEASE barrier. We need to use smp_load_acquire() here
+ * to safely ACQUIRE the memory the other task published.
+ */
+ keyring = smp_load_acquire(&sb->s_master_keys);
if (keyring == NULL)
return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 1129adfa097d..fea6226afc2b 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
[FSCRYPT_MODE_AES_256_CTS] = {
.friendly_name = "AES-256-CTS-CBC",
@@ -31,6 +32,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
[FSCRYPT_MODE_AES_128_CTS] = {
.friendly_name = "AES-128-CTS-CBC",
@@ -43,6 +45,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
};
@@ -64,9 +67,9 @@ select_encryption_mode(const union fscrypt_policy *policy,
}
/* Create a symmetric cipher object for the given encryption mode and key */
-struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
- const u8 *raw_key,
- const struct inode *inode)
+static struct crypto_skcipher *
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+ const struct inode *inode)
{
struct crypto_skcipher *tfm;
int err;
@@ -109,30 +112,56 @@ err_free_tfm:
return ERR_PTR(err);
}
-/* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+/*
+ * Prepare the crypto transform object or blk-crypto key in @prep_key, given the
+ * raw key, encryption mode, and flag indicating which encryption implementation
+ * (fs-layer or blk-crypto) will be used.
+ */
+int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key, const struct fscrypt_info *ci)
{
struct crypto_skcipher *tfm;
+ if (fscrypt_using_inline_encryption(ci))
+ return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci);
+
tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
+ * I.e., here we publish ->tfm with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it. Note that this concurrency is only
+ * possible for per-mode keys, not for per-file keys.
+ */
+ smp_store_release(&prep_key->tfm, tfm);
+ return 0;
+}
- ci->ci_ctfm = tfm;
+/* Destroy a crypto transform object and/or blk-crypto key. */
+void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
+{
+ crypto_free_skcipher(prep_key->tfm);
+ fscrypt_destroy_inline_crypt_key(prep_key);
+}
+
+/* Given a per-file encryption key, set up the file's crypto transform object */
+int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+{
ci->ci_owns_key = true;
- return 0;
+ return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
}
static int setup_per_mode_enc_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk,
- struct crypto_skcipher **tfms,
+ struct fscrypt_prepared_key *keys,
u8 hkdf_context, bool include_fs_uuid)
{
const struct inode *inode = ci->ci_inode;
const struct super_block *sb = inode->i_sb;
struct fscrypt_mode *mode = ci->ci_mode;
const u8 mode_num = mode - fscrypt_modes;
- struct crypto_skcipher *tfm;
+ struct fscrypt_prepared_key *prep_key;
u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
unsigned int hkdf_infolen = 0;
@@ -141,16 +170,15 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX))
return -EINVAL;
- /* pairs with smp_store_release() below */
- tfm = READ_ONCE(tfms[mode_num]);
- if (likely(tfm != NULL)) {
- ci->ci_ctfm = tfm;
+ prep_key = &keys[mode_num];
+ if (fscrypt_is_key_prepared(prep_key, ci)) {
+ ci->ci_enc_key = *prep_key;
return 0;
}
mutex_lock(&fscrypt_mode_key_setup_mutex);
- if (tfms[mode_num])
+ if (fscrypt_is_key_prepared(prep_key, ci))
goto done_unlock;
BUILD_BUG_ON(sizeof(mode_num) != 1);
@@ -167,16 +195,12 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
mode_key, mode->keysize);
if (err)
goto out_unlock;
- tfm = fscrypt_allocate_skcipher(mode, mode_key, inode);
+ err = fscrypt_prepare_key(prep_key, mode_key, ci);
memzero_explicit(mode_key, mode->keysize);
- if (IS_ERR(tfm)) {
- err = PTR_ERR(tfm);
+ if (err)
goto out_unlock;
- }
- /* pairs with READ_ONCE() above */
- smp_store_release(&tfms[mode_num], tfm);
done_unlock:
- ci->ci_ctfm = tfm;
+ ci->ci_enc_key = *prep_key;
err = 0;
out_unlock:
mutex_unlock(&fscrypt_mode_key_setup_mutex);
@@ -189,7 +213,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
int err;
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
(u8 *)&ci->ci_dirhash_key,
sizeof(ci->ci_dirhash_key));
if (err)
@@ -270,8 +294,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
HKDF_CONTEXT_PER_FILE_ENC_KEY,
- ci->ci_nonce,
- FS_KEY_DERIVATION_NONCE_SIZE,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
derived_key, ci->ci_mode->keysize);
if (err)
return err;
@@ -310,6 +333,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
struct fscrypt_key_specifier mk_spec;
int err;
+ err = fscrypt_select_encryption_impl(ci);
+ if (err)
+ return err;
+
switch (ci->ci_policy.version) {
case FSCRYPT_POLICY_V1:
mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
@@ -402,7 +429,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (ci->ci_direct_key)
fscrypt_put_direct_key(ci->ci_direct_key);
else if (ci->ci_owns_key)
- crypto_free_skcipher(ci->ci_ctfm);
+ fscrypt_destroy_prepared_key(&ci->ci_enc_key);
key = ci->ci_master_key;
if (key) {
@@ -472,7 +499,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
}
memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FS_KEY_DERIVATION_NONCE_SIZE);
+ FSCRYPT_FILE_NONCE_SIZE);
if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
res = -EINVAL;
@@ -491,7 +518,17 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (res)
goto out;
+ /*
+ * Multiple tasks may race to set ->i_crypt_info, so use
+ * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
+ * RELEASE barrier so that other tasks can ACQUIRE it.
+ */
if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ /*
+ * We won the race and set ->i_crypt_info to our crypt_info.
+ * Now link it into the master key's inode list.
+ */
if (master_key) {
struct fscrypt_master_key *mk =
master_key->payload.data[0];
@@ -562,7 +599,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
*/
int fscrypt_drop_inode(struct inode *inode)
{
- const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info);
+ const struct fscrypt_info *ci = fscrypt_get_info(inode);
const struct fscrypt_master_key *mk;
/*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 801b48c0cd7f..e4e707fb1100 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -45,7 +45,7 @@ static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
* key is longer, then only the first 'derived_keysize' bytes are used.
*/
static int derive_key_aes(const u8 *master_key,
- const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
u8 *derived_key, unsigned int derived_keysize)
{
int res = 0;
@@ -68,7 +68,7 @@ static int derive_key_aes(const u8 *master_key,
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
if (res < 0)
goto out;
@@ -146,7 +146,7 @@ struct fscrypt_direct_key {
struct hlist_node dk_node;
refcount_t dk_refcount;
const struct fscrypt_mode *dk_mode;
- struct crypto_skcipher *dk_ctfm;
+ struct fscrypt_prepared_key dk_key;
u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 dk_raw[FSCRYPT_MAX_KEY_SIZE];
};
@@ -154,7 +154,7 @@ struct fscrypt_direct_key {
static void free_direct_key(struct fscrypt_direct_key *dk)
{
if (dk) {
- crypto_free_skcipher(dk->dk_ctfm);
+ fscrypt_destroy_prepared_key(&dk->dk_key);
kzfree(dk);
}
}
@@ -199,6 +199,8 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
continue;
if (ci->ci_mode != dk->dk_mode)
continue;
+ if (!fscrypt_is_key_prepared(&dk->dk_key, ci))
+ continue;
if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
continue;
/* using existing tfm with same (descriptor, mode, raw_key) */
@@ -231,13 +233,9 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
dk->dk_mode = ci->ci_mode;
- dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
- ci->ci_inode);
- if (IS_ERR(dk->dk_ctfm)) {
- err = PTR_ERR(dk->dk_ctfm);
- dk->dk_ctfm = NULL;
+ err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
+ if (err)
goto err_free_dk;
- }
memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
@@ -259,7 +257,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
if (IS_ERR(dk))
return PTR_ERR(dk);
ci->ci_direct_key = dk;
- ci->ci_ctfm = dk->dk_ctfm;
+ ci->ci_enc_key = dk->dk_key;
return 0;
}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index d23ff162c78b..2d73fd39ad96 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -78,6 +78,20 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
int ino_bits = 64, lblk_bits = 64;
/*
+ * IV_INO_LBLK_* exist only because of hardware limitations, and
+ * currently the only known use case for them involves AES-256-XTS.
+ * That's also all we test currently. For these reasons, for now only
+ * allow AES-256-XTS here. This can be relaxed later if a use case for
+ * IV_INO_LBLK_* with other encryption modes arises.
+ */
+ if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) {
+ fscrypt_warn(inode,
+ "Can't use %s policy with contents mode other than AES-256-XTS",
+ type);
+ return false;
+ }
+
+ /*
* It's unsafe to include inode numbers in the IVs if the filesystem can
* potentially renumber inodes, e.g. via filesystem shrinking.
*/
@@ -338,7 +352,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
union fscrypt_context ctx;
int ret;
- ci = READ_ONCE(inode->i_crypt_info);
+ ci = fscrypt_get_info(inode);
if (ci) {
/* key available, use the cached policy */
*policy = ci->ci_policy;
@@ -529,7 +543,7 @@ int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
if (!fscrypt_context_is_valid(&ctx, ret))
return -EINVAL;
if (copy_to_user(arg, fscrypt_context_nonce(&ctx),
- FS_KEY_DERIVATION_NONCE_SIZE))
+ FSCRYPT_FILE_NONCE_SIZE))
return -EFAULT;
return 0;
}
@@ -627,7 +641,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
if (res < 0)
return res;
- ci = READ_ONCE(parent->i_crypt_info);
+ ci = fscrypt_get_info(parent);
if (ci == NULL)
return -ENOKEY;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6d5370eac2a8..183299892465 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1387,8 +1387,8 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* Attempt to prefetch the pieces we likely need later.
*/
prefetch(&bdev->bd_disk->part_tbl);
- prefetch(bdev->bd_queue);
- prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+ prefetch(bdev->bd_disk->queue);
+ prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
end_io, submit_io, flags);
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7f550327d5d..e338c407cb75 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -113,7 +113,7 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
void dlm_timeout_warn(struct dlm_lkb *lkb)
{
- struct sk_buff *uninitialized_var(send_skb);
+ struct sk_buff *send_skb;
struct dlm_lock_data *data;
size_t size;
int rv;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index e9e27a271af0..feaa5e182b7b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
+ inode->i_mtime = current_time(inode);
inode_unlock(inode);
}
@@ -72,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
ssize_t size = 0;
int err;
- while (!__ratelimit(&file->f_cred->user->ratelimit)) {
- if (!msleep_interruptible(50))
- return -EINTR;
- }
+ while (!__ratelimit(&file->f_cred->user->ratelimit))
+ msleep(50);
err = efivar_entry_size(var, &datasize);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 12c66f5d92dd..28bb5689333a 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -201,6 +201,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
+ if (!efivar_supports_writes())
+ sb->s_flags |= SB_RDONLY;
+
inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
@@ -252,9 +255,6 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
- if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
- return -ENODEV;
-
if (!efivars_kobject())
return -ENODEV;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 4a6ebff2af76..a4a945d0ac6a 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
+#include <linux/blkdev.h>
#include "efs.h"
#include <linux/efs_vh.h>
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 64b56c7df023..d0542151e8c4 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -265,7 +265,7 @@ submit_bio_out:
*/
static int erofs_raw_access_readpage(struct file *file, struct page *page)
{
- erofs_off_t uninitialized_var(last_block);
+ erofs_off_t last_block;
struct bio *bio;
trace_erofs_readpage(page, true);
@@ -282,7 +282,7 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page)
static void erofs_raw_access_readahead(struct readahead_control *rac)
{
- erofs_off_t uninitialized_var(last_block);
+ erofs_off_t last_block;
struct bio *bio = NULL;
struct page *page;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index be50a4d9d273..24a26aaf847f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1161,7 +1161,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
/* since bio will be NULL, no need to initialize last_index */
- pgoff_t uninitialized_var(last_index);
+ pgoff_t last_index;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 7824f5563a55..9b66c28b3ae9 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page)
static inline void z_erofs_onlinepage_fixup(struct page *page,
uintptr_t index, bool down)
{
- unsigned long *p, o, v, id;
-repeat:
- p = &page_private(page);
- o = READ_ONCE(*p);
+ union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+ int orig, orig_index, val;
- id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (id) {
+repeat:
+ orig = atomic_read(u.o);
+ orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+ if (orig_index) {
if (!index)
return;
- DBG_BUGON(id != index);
+ DBG_BUGON(orig_index != index);
}
- v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (cmpxchg(p, o, v) != o)
+ val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+ ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+ if (atomic_cmpxchg(u.o, orig, val) != orig)
goto repeat;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index de43534aa299..119abf0d8dd6 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -309,7 +309,7 @@ const struct file_operations exfat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate = exfat_iterate,
- .fsync = generic_file_fsync,
+ .fsync = exfat_file_fsync,
};
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu)
@@ -425,10 +425,12 @@ static void exfat_init_name_entry(struct exfat_dentry *ep,
ep->dentry.name.flags = 0x0;
for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) {
- ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname);
- if (*uniname == 0x0)
- break;
- uniname++;
+ if (*uniname != 0x0) {
+ ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname);
+ uniname++;
+ } else {
+ ep->dentry.name.unicode_0_14[i] = 0x0;
+ }
}
}
@@ -1110,7 +1112,7 @@ found:
ret = exfat_get_next_cluster(sb, &clu.dir);
}
- if (ret || clu.dir != EXFAT_EOF_CLUSTER) {
+ if (ret || clu.dir == EXFAT_EOF_CLUSTER) {
/* just initialized hint_stat */
hint_stat->clu = p_dir->dir;
hint_stat->eidx = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 595f3117f492..75c7bdbeba6d 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -371,7 +371,7 @@ static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi,
static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
unsigned int clus)
{
- return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) +
+ return ((sector_t)(clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) +
sbi->data_start_sector;
}
@@ -420,6 +420,7 @@ void exfat_truncate(struct inode *inode, loff_t size);
int exfat_setattr(struct dentry *dentry, struct iattr *attr);
int exfat_getattr(const struct path *path, struct kstat *stat,
unsigned int request_mask, unsigned int query_flags);
+int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index fce03f318787..a6a063830edc 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -175,7 +176,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ep2->dentry.stream.size = 0;
} else {
ep2->dentry.stream.valid_size = cpu_to_le64(new_size);
- ep2->dentry.stream.size = ep->dentry.stream.valid_size;
+ ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
}
if (new_size == 0) {
@@ -346,12 +347,28 @@ out:
return error;
}
+int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int err;
+
+ err = __generic_file_fsync(filp, start, end, datasync);
+ if (err)
+ return err;
+
+ err = sync_blockdev(inode->i_sb->s_bdev);
+ if (err)
+ return err;
+
+ return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
+ .fsync = exfat_file_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 5b0f35329d63..2b9e21094a96 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -975,7 +975,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- exfat_set_vol_flags(sb, VOL_DIRTY);
exfat_chain_set(&clu_to_free, ei->start_clu,
EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags);
@@ -1002,6 +1001,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
+ exfat_set_vol_flags(sb, VOL_DIRTY);
err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
if (err) {
exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
@@ -1077,10 +1077,14 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh,
&sector_old);
+ if (!epold)
+ return -EIO;
epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh,
&sector_new);
- if (!epold || !epnew)
+ if (!epnew) {
+ brelse(old_bh);
return -EIO;
+ }
memcpy(epnew, epold, DENTRY_SIZE);
exfat_update_bh(sb, new_bh, sync);
@@ -1161,10 +1165,14 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh,
&sector_mov);
+ if (!epmov)
+ return -EIO;
epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh,
&sector_new);
- if (!epmov || !epnew)
+ if (!epnew) {
+ brelse(mov_bh);
return -EIO;
+ }
memcpy(epnew, epmov, DENTRY_SIZE);
exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 57b5a7a4d1f7..a3c927501e67 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -495,7 +495,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
struct exfat_uni_name *p_uniname, int *p_lossy)
{
int i, unilen, lossy = NLS_NAME_NO_LOSSY;
- unsigned short upname[MAX_NAME_LENGTH + 1];
+ __le16 upname[MAX_NAME_LENGTH + 1];
unsigned short *uniname = p_uniname->name;
WARN_ON(!len);
@@ -519,7 +519,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
exfat_wstrchr(bad_uni_chars, *uniname))
lossy |= NLS_NAME_LOSSY;
- upname[i] = exfat_toupper(sb, *uniname);
+ upname[i] = cpu_to_le16(exfat_toupper(sb, *uniname));
uniname++;
}
@@ -597,7 +597,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
struct exfat_uni_name *p_uniname, int *p_lossy)
{
int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY;
- unsigned short upname[MAX_NAME_LENGTH + 1];
+ __le16 upname[MAX_NAME_LENGTH + 1];
unsigned short *uniname = p_uniname->name;
struct nls_table *nls = EXFAT_SB(sb)->nls_io;
@@ -611,7 +611,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
exfat_wstrchr(bad_uni_chars, *uniname))
lossy |= NLS_NAME_LOSSY;
- upname[unilen] = exfat_toupper(sb, *uniname);
+ upname[unilen] = cpu_to_le16(exfat_toupper(sb, *uniname));
uniname++;
unilen++;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index e650e65536f8..253a92460d52 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -693,10 +693,20 @@ static void exfat_free(struct fs_context *fc)
}
}
+static int exfat_reconfigure(struct fs_context *fc)
+{
+ fc->sb_flags |= SB_NODIRATIME;
+
+ /* volume flag will be updated in exfat_sync_fs */
+ sync_filesystem(fc->root->d_sb);
+ return 0;
+}
+
static const struct fs_context_operations exfat_context_ops = {
.parse_param = exfat_parse_param,
.get_tree = exfat_get_tree,
.free = exfat_free,
+ .reconfigure = exfat_reconfigure,
};
static int exfat_init_fs_context(struct fs_context *fc)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..2e42f47a7f98 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c654205f648d..1d82336b1cd4 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
struct qstr qstr = {.name = str, .len = len };
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *inode = READ_ONCE(parent->d_inode);
+ char strbuf[DNAME_INLINE_LEN];
if (!inode || !IS_CASEFOLDED(inode) ||
!EXT4_SB(inode->i_sb)->s_encoding) {
@@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
return memcmp(str, name->name, len);
}
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+
return ext4_ci_compare(inode, name, &qstr, false);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b08841f70b69..42f5060f3cdf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -426,13 +426,16 @@ struct flex_groups {
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
@@ -440,14 +443,16 @@ struct flex_groups {
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_PROJINHERIT_FL | \
+ EXT4_DAX_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -459,6 +464,10 @@ struct flex_groups {
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -499,6 +508,7 @@ enum {
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
@@ -1135,9 +1145,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1180,6 +1190,8 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
@@ -1992,6 +2004,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
+#define EXT4_FLAGS_BDEV_IS_DAX 2
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
@@ -2705,7 +2718,7 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d088ff1e902..221f240eae60 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2844,7 +2844,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 54d324e80fe5..df25d38d6539 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1116,7 +1116,7 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40ec5c7ef0d3..44bad4bb8831 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1096,7 +1096,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -3737,7 +3737,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
@@ -4403,9 +4403,11 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DAX))
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4417,14 +4419,21 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4435,8 +4444,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4650,7 +4664,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2162db0c747d..999cf6add39c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +332,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -309,9 +340,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
oldflags = ei->i_flags;
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
if (err)
goto flags_out;
@@ -320,10 +348,16 @@ static int ext4_ioctl_setflags(struct inode *inode,
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
@@ -369,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -381,7 +417,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
+
inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -390,17 +427,18 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -527,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
xflags |= FS_XFLAG_NOATIME;
if (iflags & EXT4_PROJINHERIT_FL)
xflags |= FS_XFLAG_PROJINHERIT;
+ if (iflags & EXT4_DAX_FL)
+ xflags |= FS_XFLAG_DAX;
return xflags;
}
#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
+ FS_XFLAG_DAX)
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
@@ -551,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
iflags |= EXT4_NOATIME_FL;
if (xflags & FS_XFLAG_PROJINHERIT)
iflags |= EXT4_PROJINHERIT_FL;
+ if (xflags & FS_XFLAG_DAX)
+ iflags |= EXT4_DAX_FL;
return iflags;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9083113a8c0..c0a331e2feb0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4708,7 +4708,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
- seq = *this_cpu_ptr(&discard_pa_seq);
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index de6fe969f773..defd2e10dfd1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -402,6 +402,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -418,7 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
{
int ret;
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
@@ -506,7 +508,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5761e9961682..f2df2db0786c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -195,7 +195,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
{
unsigned int post_read_steps = 0;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
@@ -230,6 +230,7 @@ int ext4_mpage_readpages(struct inode *inode,
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -258,7 +259,8 @@ int ext4_mpage_readpages(struct inode *inode,
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = next_block =
+ (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
@@ -358,7 +360,8 @@ int ext4_mpage_readpages(struct inode *inode,
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
@@ -370,6 +373,8 @@ int ext4_mpage_readpages(struct inode *inode,
*/
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c668f6b42374..0907f907c47d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= SB_RDONLY;
} else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
@@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function,
va_end(args);
if (sb_rdonly(sb) == 0) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
* before ->s_flags update
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
panic("EXT4-fs panic from previous error\n");
- }
}
void __ext4_msg(struct super_block *sb,
@@ -1324,6 +1318,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
return -EINVAL;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
res = ext4_convert_inline_data(inode);
if (res)
return res;
@@ -1349,7 +1346,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
}
return res;
}
@@ -1376,7 +1373,7 @@ retry:
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
@@ -1511,10 +1508,12 @@ enum {
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
@@ -1581,6 +1580,9 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_dax, "dax"},
+ {Opt_dax_always, "dax=always"},
+ {Opt_dax_inode, "dax=inode"},
+ {Opt_dax_never, "dax=never"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_warn_on_error, "warn_on_error"},
@@ -1609,6 +1611,7 @@ static const match_table_t tokens = {
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
@@ -1729,6 +1732,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0800
static const struct mount_opts {
int token;
@@ -1778,7 +1782,13 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+ {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
+ {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1938,6 +1948,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_nolazytime:
sb->s_flags &= ~SB_LAZYTIME;
return 1;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ ext4_msg(sb, KERN_ERR, "inline encryption not supported");
+#endif
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -2123,13 +2140,56 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
- } else if (token == Opt_dax) {
+ } else if (token == Opt_dax || token == Opt_dax_always ||
+ token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ switch (token) {
+ case Opt_dax:
+ case Opt_dax_always:
+ if (is_remount &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+ fail_dax_change_remount:
+ ext4_msg(sb, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -1;
+ }
+ if (is_remount &&
+ (test_opt(sb, DATA_FLAGS) ==
+ EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ break;
+ case Opt_dax_never:
+ if (is_remount &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ break;
+ case Opt_dax_inode:
+ if (is_remount &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ /* Strictly for printing options */
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+ break;
+ }
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
return -1;
#endif
} else if (token == Opt_data_err_abort) {
@@ -2293,7 +2353,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2353,6 +2413,20 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
fscrypt_show_test_dummy_encryption(seq, sep, sb);
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -2383,6 +2457,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -4017,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and delalloc");
goto failed_mount;
}
- if (test_opt(sb, DAX)) {
+ if (test_opt(sb, DAX_ALWAYS)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and dax");
goto failed_mount;
@@ -4127,13 +4202,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (bdev_dax_supported(sb->s_bdev, blocksize))
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
goto failed_mount;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
goto failed_mount;
@@ -5447,12 +5525,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5468,12 +5540,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
- }
-
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dec1244dd062..bbd5e7e0632b 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp)
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -241,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
if (err)
goto out_stop;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
out_stop:
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b29a40738ac..7d2f6576d954 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ffe21ac77f78..730b91fa0dd7 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -124,6 +124,7 @@ struct ext4_xattr_inode_array {
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..8cfa74a56361
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 1e02a8c106b0..29e50fbe7eca 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1086,7 +1086,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.submitted = false,
.io_type = io_type,
.io_wbc = wbc,
- .encrypted = f2fs_encrypted_file(cc->inode),
+ .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode),
};
struct dnode_of_data dn;
struct node_info ni;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 326c63879ddc..5f527073143e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -14,6 +14,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/blk-crypto.h>
#include <linux/swap.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
@@ -459,6 +460,33 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
return bio;
}
+static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
+ pgoff_t first_idx,
+ const struct f2fs_io_info *fio,
+ gfp_t gfp_mask)
+{
+ /*
+ * The f2fs garbage collector sets ->encrypted_page when it wants to
+ * read/write raw data without encryption.
+ */
+ if (!fio || !fio->encrypted_page)
+ fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask);
+}
+
+static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
+ pgoff_t next_idx,
+ const struct f2fs_io_info *fio)
+{
+ /*
+ * The f2fs garbage collector sets ->encrypted_page when it wants to
+ * read/write raw data without encryption.
+ */
+ if (fio && fio->encrypted_page)
+ return !bio_has_crypt_ctx(bio);
+
+ return fscrypt_mergeable_bio(bio, inode, next_idx);
+}
+
static inline void __submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type)
{
@@ -684,6 +712,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
/* Allocate a new bio */
bio = __bio_alloc(fio, 1);
+ f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
+ fio->page->index, fio, GFP_NOIO);
+
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
@@ -763,9 +794,10 @@ static void del_bio_entry(struct bio_entry *be)
kmem_cache_free(bio_entry_slab, be);
}
-static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
+static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct page *page)
{
+ struct f2fs_sb_info *sbi = fio->sbi;
enum temp_type temp;
bool found = false;
int ret = -EAGAIN;
@@ -782,13 +814,19 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
found = true;
- if (bio_add_page(*bio, page, PAGE_SIZE, 0) ==
- PAGE_SIZE) {
+ f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio,
+ *fio->last_block,
+ fio->new_blkaddr));
+ if (f2fs_crypt_mergeable_bio(*bio,
+ fio->page->mapping->host,
+ fio->page->index, fio) &&
+ bio_add_page(*bio, page, PAGE_SIZE, 0) ==
+ PAGE_SIZE) {
ret = 0;
break;
}
- /* bio is full */
+ /* page can't be merged into bio; submit the bio */
del_bio_entry(be);
__submit_bio(sbi, *bio, DATA);
break;
@@ -880,11 +918,13 @@ alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_PAGES);
__attach_io_flag(fio);
+ f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
+ fio->page->index, fio, GFP_NOIO);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
- if (add_ipu_page(fio->sbi, &bio, page))
+ if (add_ipu_page(fio, &bio, page))
goto alloc_new;
}
@@ -936,8 +976,11 @@ next:
inc_page_count(sbi, WB_DATA_TYPE(bio_page));
- if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio,
- io->last_block_in_bio, fio->new_blkaddr))
+ if (io->bio &&
+ (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
+ fio->new_blkaddr) ||
+ !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host,
+ bio_page->index, fio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
@@ -949,6 +992,8 @@ alloc_new:
goto skip;
}
io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
+ f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
+ bio_page->index, fio, GFP_NOIO);
io->fio = *fio;
}
@@ -993,11 +1038,14 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
for_write);
if (!bio)
return ERR_PTR(-ENOMEM);
+
+ f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
+
f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
- if (f2fs_encrypted_file(inode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (f2fs_compressed_file(inode))
post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
@@ -2073,8 +2121,9 @@ zero_out:
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio,
- *last_block_in_bio, block_nr)) {
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ *last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
__submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
@@ -2204,8 +2253,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
- if (bio && !page_is_mergeable(sbi, bio,
- *last_block_in_bio, blkaddr)) {
+ if (bio && (!page_is_mergeable(sbi, bio,
+ *last_block_in_bio, blkaddr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
__submit_bio(sbi, bio, DATA);
bio = NULL;
@@ -2421,6 +2471,9 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
+ if (fscrypt_inode_uses_inline_crypto(inode))
+ return 0;
+
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
PAGE_SIZE, 0, gfp_flags);
@@ -2594,7 +2647,7 @@ got_it:
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
if (err) {
- if (f2fs_encrypted_file(inode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
if (PageWriteback(page))
end_page_writeback(page);
@@ -2856,7 +2909,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
};
#endif
int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -2875,8 +2927,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
clear_inode_flag(mapping->host, FI_HOT_DATA);
if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
+ index = mapping->writeback_index; /* prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 20e56b0fa46a..23c49c313fb6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -138,6 +138,7 @@ enum {
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_checkpoint_disable,
Opt_checkpoint_disable_cap,
Opt_checkpoint_disable_cap_perc,
@@ -204,6 +205,7 @@ static match_table_t f2fs_tokens = {
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_checkpoint_disable, "checkpoint=disable"},
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
@@ -833,6 +835,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (ret)
return ret;
break;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ f2fs_info(sbi, "inline encryption not supported");
+#endif
+ break;
case Opt_checkpoint_disable_cap_perc:
if (args->from && match_int(args, &arg))
return -EINVAL;
@@ -1590,6 +1599,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
+ if (sbi->sb->s_flags & SB_INLINECRYPT)
+ seq_puts(seq, ",inlinecrypt");
+
if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT)
seq_printf(seq, ",alloc_mode=%s", "default");
else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
@@ -1624,6 +1636,8 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_ext_cnt = 0;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
+ sbi->sb->s_flags &= ~SB_INLINECRYPT;
+
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -2470,6 +2484,25 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
*lblk_bits_ret = 8 * sizeof(block_t);
}
+static int f2fs_get_num_devices(struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (f2fs_is_multi_device(sbi))
+ return sbi->s_ndevs;
+ return 1;
+}
+
+static void f2fs_get_devices(struct super_block *sb,
+ struct request_queue **devs)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ devs[i] = bdev_get_queue(FDEV(i).bdev);
+}
+
static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
@@ -2479,6 +2512,8 @@ static const struct fscrypt_operations f2fs_cryptops = {
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
+ .get_num_devices = f2fs_get_num_devices,
+ .get_devices = f2fs_get_devices,
};
#endif
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index b4ddf48fa444..c4a274285858 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1284,7 +1284,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct super_block *sb = dir->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
- struct msdos_dir_entry *uninitialized_var(de);
+ struct msdos_dir_entry *de;
int err, free_slots, i, nr_bhs;
loff_t pos, i_pos;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index c23f6f243ad4..a1303ad303ba 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
struct fuse_conn *fc;
ssize_t ret;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 030f094910c3..2cc17816d7b1 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -270,7 +270,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
{
char *end = p + len;
- char *uninitialized_var(key), *uninitialized_var(val);
+ char *key, *val;
int rc;
while (true) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e573b0cd2737..6611ef3269a8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -18,6 +18,7 @@
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
+#include <linux/fs.h>
static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
struct fuse_page_desc **desc)
@@ -1586,7 +1587,6 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- rb_erase(&wpa->writepages_entry, &fi->writepages);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1637,6 +1637,7 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
fuse_writepage_finish(fc, wpa);
spin_unlock(&fi->lock);
@@ -1674,7 +1675,8 @@ __acquires(fi->lock)
}
}
-static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
+ struct fuse_writepage_args *wpa)
{
pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
@@ -1697,11 +1699,17 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
else if (idx_to < curr_index)
p = &(*p)->rb_left;
else
- return (void) WARN_ON(true);
+ return curr;
}
rb_link_node(&wpa->writepages_entry, parent, p);
rb_insert_color(&wpa->writepages_entry, root);
+ return NULL;
+}
+
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+ WARN_ON(fuse_insert_writeback(root, wpa));
}
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
@@ -1714,6 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
mapping_set_error(inode->i_mapping, error);
spin_lock(&fi->lock);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
@@ -1952,14 +1961,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
}
/*
- * First recheck under fi->lock if the offending offset is still under
- * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * Check under fi->lock if the page is under writeback, and insert it onto the
+ * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
* one already added for a page at this offset. If there's none, then insert
* this new request onto the auxiliary list, otherwise reuse the existing one by
- * copying the new page contents over to the old temporary page.
+ * swapping the new temp page with the old one.
*/
-static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
- struct page *page)
+static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
+ struct page *page)
{
struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
struct fuse_writepage_args *tmp;
@@ -1967,17 +1976,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
WARN_ON(new_ap->num_pages != 0);
+ new_ap->num_pages = 1;
spin_lock(&fi->lock);
- rb_erase(&new_wpa->writepages_entry, &fi->writepages);
- old_wpa = fuse_find_writeback(fi, page->index, page->index);
+ old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
if (!old_wpa) {
- tree_insert(&fi->writepages, new_wpa);
spin_unlock(&fi->lock);
- return false;
+ return true;
}
- new_ap->num_pages = 1;
for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
pgoff_t curr_index;
@@ -2006,7 +2013,41 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
fuse_writepage_free(new_wpa);
}
- return true;
+ return false;
+}
+
+static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
+ struct fuse_args_pages *ap,
+ struct fuse_fill_wb_data *data)
+{
+ WARN_ON(!ap->num_pages);
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ if (fuse_page_is_writeback(data->inode, page->index))
+ return true;
+
+ /* Reached max pages */
+ if (ap->num_pages == fc->max_pages)
+ return true;
+
+ /* Reached max write bytes */
+ if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
+ return true;
+
+ /* Discontinuity */
+ if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
+ return true;
+
+ /* Need to grow the pages array? If so, did the expansion fail? */
+ if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
+ return true;
+
+ return false;
}
static int fuse_writepages_fill(struct page *page,
@@ -2019,7 +2060,6 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
- bool is_writeback;
int err;
if (!data->ff) {
@@ -2029,25 +2069,9 @@ static int fuse_writepages_fill(struct page *page,
goto out_unlock;
}
- /*
- * Being under writeback is unlikely but possible. For example direct
- * read to an mmaped fuse file will set the page dirty twice; once when
- * the pages are faulted with get_user_pages(), and then after the read
- * completed.
- */
- is_writeback = fuse_page_is_writeback(inode, page->index);
-
- if (wpa && ap->num_pages &&
- (is_writeback || ap->num_pages == fc->max_pages ||
- (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
- data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
+ if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
- } else if (wpa && ap->num_pages == data->max_pages) {
- if (!fuse_pages_realloc(data)) {
- fuse_writepages_send(data);
- data->wpa = NULL;
- }
}
err = -ENOMEM;
@@ -2085,12 +2109,6 @@ static int fuse_writepages_fill(struct page *page,
ap->args.end = fuse_writepage_end;
ap->num_pages = 0;
wpa->inode = inode;
-
- spin_lock(&fi->lock);
- tree_insert(&fi->writepages, wpa);
- spin_unlock(&fi->lock);
-
- data->wpa = wpa;
}
set_page_writeback(page);
@@ -2098,26 +2116,25 @@ static int fuse_writepages_fill(struct page *page,
ap->pages[ap->num_pages] = tmp_page;
ap->descs[ap->num_pages].offset = 0;
ap->descs[ap->num_pages].length = PAGE_SIZE;
+ data->orig_pages[ap->num_pages] = page;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
- if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
+ if (data->wpa) {
+ /*
+ * Protected by fi->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fi->lock);
+ ap->num_pages++;
+ spin_unlock(&fi->lock);
+ } else if (fuse_writepage_add(wpa, page)) {
+ data->wpa = wpa;
+ } else {
end_page_writeback(page);
- data->wpa = NULL;
- goto out_unlock;
}
- data->orig_pages[ap->num_pages] = page;
-
- /*
- * Protected by fi->lock against concurrent access by
- * fuse_page_is_writeback().
- */
- spin_lock(&fi->lock);
- ap->num_pages++;
- spin_unlock(&fi->lock);
-
out_unlock:
unlock_page(page);
@@ -2149,10 +2166,8 @@ static int fuse_writepages(struct address_space *mapping,
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.wpa) {
- /* Ignore errors if we can write at least one page */
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
- err = 0;
}
if (data.ff)
fuse_file_put(data.ff, false, false);
@@ -2761,7 +2776,16 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
struct iovec *iov = iov_page;
iov->iov_base = (void __user *)arg;
- iov->iov_len = _IOC_SIZE(cmd);
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ case FS_IOC_SETFLAGS:
+ iov->iov_len = sizeof(int);
+ break;
+ default:
+ iov->iov_len = _IOC_SIZE(cmd);
+ break;
+ }
if (_IOC_DIR(cmd) & _IOC_WRITE) {
in_iov = iov;
@@ -2963,7 +2987,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
{
spin_lock(&fc->lock);
if (RB_EMPTY_NODE(&ff->polled_node)) {
- struct rb_node **link, *uninitialized_var(parent);
+ struct rb_node **link, *parent;
link = fuse_find_polled_node(fc, ff->kh, &parent);
BUG_ON(*link);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5b4aebf5821f..bba747520e9b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -121,10 +121,12 @@ static void fuse_evict_inode(struct inode *inode)
}
}
-static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+static int fuse_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- if (*flags & SB_MANDLOCK)
+ if (fc->sb_flags & SB_MANDLOCK)
return -EINVAL;
return 0;
@@ -475,6 +477,17 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fuse_fs_context *ctx = fc->fs_private;
int opt;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ /*
+ * Ignore options coming from mount(MS_REMOUNT) for backward
+ * compatibility.
+ */
+ if (fc->oldapi)
+ return 0;
+
+ return invalfc(fc, "No changes allowed in reconfigure");
+ }
+
opt = fs_parse(fc, fuse_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -817,7 +830,6 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .remount_fs = fuse_remount_fs,
.put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
@@ -1296,6 +1308,7 @@ static int fuse_get_tree(struct fs_context *fc)
static const struct fs_context_operations fuse_context_ops = {
.free = fuse_free_fc,
.parse_param = fuse_parse_param,
+ .reconfigure = fuse_reconfigure,
.get_tree = fuse_get_tree,
};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..d4af283fc888 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -335,7 +335,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
int done = 0;
struct pagevec pvec;
int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
+ pgoff_t writeback_index;
pgoff_t index;
pgoff_t end;
pgoff_t done_index;
@@ -468,21 +468,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
}
-/**
- * __gfs2_readpage - readpage
- * @file: The file to read a page for
- * @page: The page to read
- *
- * This is the core of gfs2's readpage. It's used by the internal file
- * reading code as in that case we already hold the glock. Also it's
- * called by gfs2_readpage() once the required lock has been granted.
- */
-
static int __gfs2_readpage(void *file, struct page *page)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-
int error;
if (i_blocksize(page->mapping->host) == PAGE_SIZE &&
@@ -505,36 +494,11 @@ static int __gfs2_readpage(void *file, struct page *page)
* gfs2_readpage - read a page of a file
* @file: The file to read
* @page: The page of the file
- *
- * This deals with the locking required. We have to unlock and
- * relock the page in order to get the locking in the right
- * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
- struct gfs2_holder gh;
- int error;
-
- unlock_page(page);
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- error = gfs2_glock_nq(&gh);
- if (unlikely(error))
- goto out;
- error = AOP_TRUNCATED_PAGE;
- lock_page(page);
- if (page->mapping == mapping && !PageUptodate(page))
- error = __gfs2_readpage(file, page);
- else
- unlock_page(page);
- gfs2_glock_dq(&gh);
-out:
- gfs2_holder_uninit(&gh);
- if (error && error != AOP_TRUNCATED_PAGE)
- lock_page(page);
- return error;
+ return __gfs2_readpage(file, page);
}
/**
@@ -598,16 +562,9 @@ static void gfs2_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- if (gfs2_glock_nq(&gh))
- goto out_uninit;
if (!gfs2_is_stuffed(ip))
mpage_readahead(rac, gfs2_block_map);
- gfs2_glock_dq(&gh);
-out_uninit:
- gfs2_holder_uninit(&gh);
}
/**
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6306eaae378b..8dfe09f52cbc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1761,7 +1761,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
__u16 start_list[GFS2_MAX_META_HEIGHT];
__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
- unsigned int start_aligned, uninitialized_var(end_aligned);
+ unsigned int start_aligned, end_aligned;
unsigned int strip_h = ip->i_height - 1;
u32 btotal = 0;
int ret, state;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fe305e4bfd37..bebde537ac8c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -558,8 +558,29 @@ out_uninit:
return block_page_mkwrite_return(ret);
}
+static vm_fault_t gfs2_fault(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ vm_fault_t ret;
+ int err;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ err = gfs2_glock_nq(&gh);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
+ goto out_uninit;
+ }
+ ret = filemap_fault(vmf);
+ gfs2_glock_dq(&gh);
+out_uninit:
+ gfs2_holder_uninit(&gh);
+ return ret;
+}
+
static const struct vm_operations_struct gfs2_vm_ops = {
- .fault = filemap_fault,
+ .fault = gfs2_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
};
@@ -824,6 +845,9 @@ out_uninit:
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct gfs2_inode *ip;
+ struct gfs2_holder gh;
+ size_t written = 0;
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
@@ -832,7 +856,31 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
iocb->ki_flags &= ~IOCB_DIRECT;
}
- return generic_file_read_iter(iocb, to);
+ iocb->ki_flags |= IOCB_NOIO;
+ ret = generic_file_read_iter(iocb, to);
+ iocb->ki_flags &= ~IOCB_NOIO;
+ if (ret >= 0) {
+ if (!iov_iter_count(to))
+ return ret;
+ written = ret;
+ } else {
+ if (ret != -EAGAIN)
+ return ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return ret;
+ }
+ ip = GFS2_I(iocb->ki_filp->f_mapping->host);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret)
+ goto out_uninit;
+ ret = generic_file_read_iter(iocb, to);
+ if (ret > 0)
+ written += ret;
+ gfs2_glock_dq(&gh);
+out_uninit:
+ gfs2_holder_uninit(&gh);
+ return written ? written : ret;
}
/**
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 2299dcc417ea..8545024a1401 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1899,7 +1899,10 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl)
static void flush_delete_work(struct gfs2_glock *gl)
{
- flush_delayed_work(&gl->gl_delete);
+ if (cancel_delayed_work(&gl->gl_delete)) {
+ queue_delayed_work(gfs2_delete_workqueue,
+ &gl->gl_delete, 0);
+ }
gfs2_glock_queue_work(gl, 0);
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c84887769b5a..de1d5f1d9ff8 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -531,8 +531,7 @@ static int freeze_go_sync(struct gfs2_glock *gl)
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
@@ -545,8 +544,11 @@ static int freeze_go_sync(struct gfs2_glock *gl)
gfs2_assert_withdraw(sdp, 0);
}
queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
- GFS2_LFC_FREEZE_GO_SYNC);
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+ GFS2_LFC_FREEZE_GO_SYNC);
+ else /* read-only mounts */
+ atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
return 0;
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 03ab11fab962..ca2ec02436ec 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -399,7 +399,6 @@ enum {
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
- GIF_ORDERED = 4,
GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
GIF_DEFERRED_DELETE = 7,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 370c3a4b31ac..6774865f5b5b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -207,10 +207,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (no_formal_ino && ip->i_no_formal_ino &&
no_formal_ino != ip->i_no_formal_ino) {
+ error = -ESTALE;
if (inode->i_state & I_NEW)
goto fail;
iput(inode);
- return ERR_PTR(-ESTALE);
+ return ERR_PTR(error);
}
if (inode->i_state & I_NEW)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 3e4734431783..a76e55bc28eb 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -613,6 +613,12 @@ static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
+static void __ordered_del_inode(struct gfs2_inode *ip)
+{
+ if (!list_empty(&ip->i_ordered))
+ list_del_init(&ip->i_ordered);
+}
+
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip;
@@ -623,8 +629,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
if (ip->i_inode.i_mapping->nrpages == 0) {
- test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
- list_del(&ip->i_ordered);
+ __ordered_del_inode(ip);
continue;
}
list_move(&ip->i_ordered, &written);
@@ -643,8 +648,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
- list_del(&ip->i_ordered);
- WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+ __ordered_del_inode(ip);
if (ip->i_inode.i_mapping->nrpages == 0)
continue;
spin_unlock(&sdp->sd_ordered_lock);
@@ -659,8 +663,7 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
spin_lock(&sdp->sd_ordered_lock);
- if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
- list_del(&ip->i_ordered);
+ __ordered_del_inode(ip);
spin_unlock(&sdp->sd_ordered_lock);
}
@@ -1002,6 +1005,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
out:
if (gfs2_withdrawn(sdp)) {
+ /**
+ * If the tr_list is empty, we're withdrawing during a log
+ * flush that targets a transaction, but the transaction was
+ * never queued onto any of the ail lists. Here we add it to
+ * ail1 just so that ail_drain() will find and free it.
+ */
+ spin_lock(&sdp->sd_ail_lock);
+ if (tr && list_empty(&tr->tr_list))
+ list_add(&tr->tr_list, &sdp->sd_ail1_list);
+ spin_unlock(&sdp->sd_ail_lock);
ail_drain(sdp); /* frees all transactions */
tr = NULL;
}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c1cd6ae17659..8965c751a303 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -53,9 +53,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp))
return;
- if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
+ if (list_empty(&ip->i_ordered)) {
spin_lock(&sdp->sd_ordered_lock);
- if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
+ if (list_empty(&ip->i_ordered))
list_add(&ip->i_ordered, &sdp->sd_log_ordered);
spin_unlock(&sdp->sd_ordered_lock);
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index cb2a11b458c6..ed1da4323967 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -419,7 +419,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
struct page *page)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct gfs2_log_header_host uninitialized_var(lh);
+ struct gfs2_log_header_host lh;
void *kaddr = kmap_atomic(page);
unsigned int offset;
bool ret = false;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 733470ca6be9..c7393ee9cf68 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -39,6 +39,7 @@ static void gfs2_init_inode_once(void *foo)
atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
+ INIT_LIST_HEAD(&ip->i_ordered);
ip->i_qadata = NULL;
gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
memset(&ip->i_res, 0, sizeof(ip->i_res));
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 094f5fe7c009..6d18d2c91add 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1136,7 +1136,18 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
goto fail_per_node;
}
- if (!sb_rdonly(sb)) {
+ if (sb_rdonly(sb)) {
+ struct gfs2_holder freeze_gh;
+
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
+ &freeze_gh);
+ if (error) {
+ fs_err(sdp, "can't make FS RO: %d\n", error);
+ goto fail_per_node;
+ }
+ gfs2_glock_dq_uninit(&freeze_gh);
+ } else {
error = gfs2_make_fs_rw(sdp);
if (error) {
fs_err(sdp, "can't make FS RW: %d\n", error);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 96c345f49273..390ea79d682c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -364,8 +364,8 @@ void gfs2_recover_func(struct work_struct *work)
/* Acquire a shared hold on the freeze lock */
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | LM_FLAG_PRIORITY,
- &thaw_gh);
+ LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
+ GL_EXACT, &thaw_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 32d8d26126a1..47d0ae158b69 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -167,7 +167,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
if (error)
return error;
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
&freeze_gh);
if (error)
goto fail_threads;
@@ -203,7 +204,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
return 0;
fail:
- freeze_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&freeze_gh);
fail_threads:
if (sdp->sd_quotad_process)
@@ -430,7 +430,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
}
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
- GL_NOCACHE, &sdp->sd_freeze_gh);
+ LM_FLAG_NOEXP, &sdp->sd_freeze_gh);
if (error)
goto out;
@@ -613,13 +613,15 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
!gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) {
if (!log_write_allowed) {
error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
- LM_ST_SHARED, GL_NOCACHE |
- LM_FLAG_TRY, &freeze_gh);
+ LM_ST_SHARED, LM_FLAG_TRY |
+ LM_FLAG_NOEXP | GL_EXACT,
+ &freeze_gh);
if (error == GLR_TRYFAILED)
error = 0;
} else {
error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
- LM_ST_SHARED, GL_NOCACHE,
+ LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
&freeze_gh);
if (error && !gfs2_withdrawn(sdp))
return error;
@@ -761,8 +763,8 @@ void gfs2_freeze_func(struct work_struct *work)
struct super_block *sb = sdp->sd_vfs;
atomic_inc(&sb->s_active);
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
- &freeze_gh);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT, &freeze_gh);
if (error) {
fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error);
gfs2_assert_withdraw(sdp, 0);
@@ -774,8 +776,6 @@ void gfs2_freeze_func(struct work_struct *work)
error);
gfs2_assert_withdraw(sdp, 0);
}
- if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
- freeze_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&freeze_gh);
}
deactivate_super(sb);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2f224b98ee94..f35a37c65e5f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -17,6 +17,7 @@
#include <linux/cred.h>
#include <linux/uio.h>
#include <linux/xattr.h>
+#include <linux/blkdev.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index c8d1b2be7854..73342c925a4b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -398,7 +398,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
astr = str->name;
len = str->len;
while (len > 0) {
- int uninitialized_var(dsize);
+ int dsize;
size = asc2unichar(sb, astr, len, &c);
astr += size;
len -= size;
diff --git a/fs/internal.h b/fs/internal.h
index 9b863a7bd708..969988d3d397 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,7 +23,9 @@ struct user_namespace;
extern void __init bdev_cache_init(void);
extern int __sync_blockdev(struct block_device *bdev, int wait);
-
+void iterate_bdevs(void (*)(struct block_device *, void *), void *);
+void emergency_thaw_bdev(struct super_block *sb);
+void bd_forget(struct inode *inode);
#else
static inline void bdev_cache_init(void)
{
@@ -33,7 +35,18 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
{
return 0;
}
-#endif
+static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
+ void *arg)
+{
+}
+static inline int emergency_thaw_bdev(struct super_block *sb)
+{
+ return 0;
+}
+static inline void bd_forget(struct inode *inode)
+{
+}
+#endif /* CONFIG_BLOCK */
/*
* buffer.c
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0b65a912b036..e92c4724480c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker,
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
}
static void io_assign_current_work(struct io_worker *worker,
@@ -489,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
- unsigned int hash;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -512,6 +512,7 @@ get_next:
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
+ unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
@@ -522,10 +523,8 @@ get_next:
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
- hash = io_get_work_hash(work);
- linked = old_work = work;
- wq->do_work(&linked);
- linked = (old_work == linked) ? NULL : linked;
+ old_work = work;
+ linked = wq->do_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
@@ -542,8 +541,6 @@ get_next:
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- /* dependent work is not hashed */
- hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
@@ -781,8 +778,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(&work);
- work = (work == old_work) ? NULL : work;
+ work = wq->do_work(work);
wq->free_work(old_work);
} while (work);
}
@@ -903,13 +899,15 @@ void io_wq_cancel_all(struct io_wq *wq)
struct io_cb_cancel_data {
work_cancel_fn *fn;
void *data;
+ int nr_running;
+ int nr_pending;
+ bool cancel_all;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
unsigned long flags;
- bool ret = false;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
@@ -920,74 +918,90 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
- ret = true;
+ match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
- return ret;
+ return match->nr_running && !match->cancel_all;
}
-static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
- bool found = false;
- /*
- * First check pending list, if we're lucky we can just remove it
- * from there. CANCEL_OK means that the work is returned as-new,
- * no completion will be posted for it.
- */
+retry:
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
+ if (!match->fn(work, match->data))
+ continue;
- if (match->fn(work, match->data)) {
- wq_list_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
+ wq_list_del(&wqe->work_list, node, prev);
+ spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
- return IO_WQ_CANCEL_OK;
+ match->nr_pending++;
+ if (!match->cancel_all)
+ return;
+
+ /* not safe to continue after unlock */
+ goto retry;
}
+ spin_unlock_irqrestore(&wqe->lock, flags);
+}
- /*
- * Now check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempt to signal cancellation. The
- * completion will run normally in this case.
- */
+static void io_wqe_cancel_running_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
+{
rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
+ io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
+ void *data, bool cancel_all)
{
struct io_cb_cancel_data match = {
- .fn = cancel,
- .data = data,
+ .fn = cancel,
+ .data = data,
+ .cancel_all = cancel_all,
};
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
+ /*
+ * First check pending list, if we're lucky we can just remove it
+ * from there. CANCEL_OK means that the work is returned as-new,
+ * no completion will be posted for it.
+ */
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- ret = io_wqe_cancel_work(wqe, &match);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
+ io_wqe_cancel_pending_work(wqe, &match);
+ if (match.nr_pending && !match.cancel_all)
+ return IO_WQ_CANCEL_OK;
}
- return ret;
+ /*
+ * Now check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempt to signal cancellation. The
+ * completion will run normally in this case.
+ */
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ io_wqe_cancel_running_work(wqe, &match);
+ if (match.nr_running && !match.cancel_all)
+ return IO_WQ_CANCEL_RUNNING;
+ }
+
+ if (match.nr_running)
+ return IO_WQ_CANCEL_RUNNING;
+ if (match.nr_pending)
+ return IO_WQ_CANCEL_OK;
+ return IO_WQ_CANCEL_NOTFOUND;
}
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
@@ -997,21 +1011,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
-}
-
-static bool io_wq_pid_match(struct io_wq_work *work, void *data)
-{
- pid_t pid = (pid_t) (unsigned long) data;
-
- return work->task_pid == pid;
-}
-
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
-{
- void *data = (void *) (unsigned long) pid;
-
- return io_wq_cancel_cb(wq, io_wq_pid_match, data);
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 8e138fa88b9f..ddaf9614cf9b 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,10 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HASHED = 4,
- IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_NO_CANCEL = 256,
- IO_WQ_WORK_CONCURRENT = 512,
+ IO_WQ_WORK_HASHED = 2,
+ IO_WQ_WORK_UNBOUND = 4,
+ IO_WQ_WORK_NO_CANCEL = 8,
+ IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -89,8 +89,8 @@ struct io_wq_work {
struct mm_struct *mm;
const struct cred *creds;
struct fs_struct *fs;
+ unsigned long fsize;
unsigned flags;
- pid_t task_pid;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -102,7 +102,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
}
typedef void (free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work **);
+typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
@@ -125,12 +125,11 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data);
+ void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..2a3af95be4ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
+#include <linux/pagemap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -226,7 +227,7 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int account_mem: 1;
+ unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
@@ -319,12 +320,12 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/*
- * ->poll_list is protected by the ctx->uring_lock for
+ * ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head poll_list;
+ struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
@@ -395,6 +396,7 @@ struct io_timeout {
int flags;
u32 off;
u32 target_seq;
+ struct list_head list;
};
struct io_rw {
@@ -413,7 +415,7 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *msg;
+ struct user_msghdr __user *umsg;
void __user *buf;
};
int msg_flags;
@@ -486,6 +488,12 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_completion {
+ struct file *file;
+ struct list_head list;
+ int cflags;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -503,6 +511,7 @@ struct io_async_rw {
struct iovec *iov;
ssize_t nr_segs;
ssize_t size;
+ struct wait_page_queue wpq;
};
struct io_async_ctx {
@@ -523,24 +532,20 @@ enum {
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_HEAD_BIT,
- REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_MUST_PUNT_BIT,
- REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
- REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -562,8 +567,6 @@ enum {
/* head of a link */
REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
- /* already grabbed next link */
- REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -574,14 +577,8 @@ enum {
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* timeout request */
- REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* must be punted even for NONBLOCK */
- REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
- /* no timeout sequence */
- REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
@@ -594,15 +591,15 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
- /* needs to queue linked timeout */
- REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* req->task is refcounted */
+ REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
struct io_poll_iocb poll;
- struct io_wq_work work;
+ struct io_poll_iocb *double_poll;
};
/*
@@ -631,51 +628,54 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ /* use only after cleaning per-op data, see io_clean_op() */
+ struct io_completion compl;
};
struct io_async_ctx *io;
- int cflags;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
u16 buf_index;
+ u32 result;
- struct io_ring_ctx *ctx;
- struct list_head list;
- unsigned int flags;
- refcount_t refs;
- struct task_struct *task;
- unsigned long fsize;
- u64 user_data;
- u32 result;
- u32 sequence;
-
- struct list_head link_list;
+ struct io_ring_ctx *ctx;
+ unsigned int flags;
+ refcount_t refs;
+ struct task_struct *task;
+ u64 user_data;
- struct list_head inflight_entry;
+ struct list_head link_list;
- struct percpu_ref *fixed_file_refs;
+ /*
+ * 1. used with ctx->iopoll_list with reads/writes
+ * 2. to track reqs with ->files (see io_op_def::file_table)
+ */
+ struct list_head inflight_entry;
+
+ struct percpu_ref *fixed_file_refs;
+ struct callback_head task_work;
+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ struct io_wq_work work;
+};
- union {
- /*
- * Only commands that never go async can use the below fields,
- * obviously. Right now only IORING_OP_POLL_ADD uses them, and
- * async armed poll handlers for regular commands. The latter
- * restore the work, if needed.
- */
- struct {
- struct callback_head task_work;
- struct hlist_node hash_node;
- struct async_poll *apoll;
- };
- struct io_wq_work work;
- };
+struct io_defer_entry {
+ struct list_head list;
+ struct io_kiocb *req;
+ u32 seq;
};
-#define IO_PLUG_THRESHOLD 2
#define IO_IOPOLL_BATCH 8
+struct io_comp_state {
+ unsigned int nr;
+ struct list_head list;
+ struct io_ring_ctx *ctx;
+};
+
struct io_submit_state {
struct blk_plug plug;
@@ -686,12 +686,16 @@ struct io_submit_state {
unsigned int free_reqs;
/*
+ * Batch completion logic
+ */
+ struct io_comp_state comp;
+
+ /*
* File reference cache
*/
struct file *file;
unsigned int fd;
unsigned int has_refs;
- unsigned int used_refs;
unsigned int ios_left;
};
@@ -719,6 +723,7 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ unsigned needs_fsize : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -738,6 +743,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -752,6 +758,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -804,6 +811,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .needs_fsize = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
@@ -835,6 +843,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -877,21 +886,37 @@ static const struct io_op_def io_op_defs[] = {
},
};
-static void io_wq_submit_work(struct io_wq_work **workptr);
+enum io_mem_account {
+ ACCT_LOCKED,
+ ACCT_PINNED,
+};
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
-static void io_cleanup_req(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe);
+ const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter);
static struct kmem_cache *req_cachep;
@@ -910,7 +935,62 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
-static void io_file_put_work(struct work_struct *work);
+static void io_get_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ return;
+ get_task_struct(req->task);
+ req->flags |= REQ_F_TASK_PINNED;
+}
+
+static inline void io_clean_op(struct io_kiocb *req)
+{
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+ __io_clean_op(req);
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ put_task_struct(req->task);
+}
+
+static void io_sq_thread_drop_mm(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+ if (!current->mm) {
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+ !mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].needs_mm)
+ return 0;
+ return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
/*
* Note: must call io_req_init_async() for the first time you
@@ -937,6 +1017,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
complete(&ctx->ref_comp);
}
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+ return !req->timeout.off;
+}
+
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
@@ -980,7 +1065,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -997,18 +1082,14 @@ err:
return NULL;
}
-static inline bool __req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
- struct io_ring_ctx *ctx = req->ctx;
+ if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+ struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail
+ return seq != ctx->cached_cq_tail
+ atomic_read(&ctx->cached_cq_overflow);
-}
-
-static inline bool req_need_defer(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN))
- return __req_need_defer(req);
+ }
return false;
}
@@ -1026,30 +1107,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_req_work_grab_env(struct io_kiocb *req,
- const struct io_op_def *def)
-{
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
- }
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
- if (!req->work.task_pid)
- req->work.task_pid = task_pid_vnr(current);
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
@@ -1071,14 +1129,17 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
spin_unlock(&req->work.fs->lock);
if (fs)
free_fs_struct(fs);
+ req->work.fs = NULL;
}
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
}
-static inline void io_prep_async_work(struct io_kiocb *req,
- struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ io_req_init_async(req);
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
io_wq_hash_work(&req->work, file_inode(req->file));
@@ -1086,18 +1147,42 @@ static inline void io_prep_async_work(struct io_kiocb *req,
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
+ }
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (def->needs_fsize)
+ req->work.fsize = rlimit(RLIMIT_FSIZE);
+ else
+ req->work.fsize = RLIM_INFINITY;
+}
- io_req_work_grab_env(req, def);
+static void io_prep_async_link(struct io_kiocb *req)
+{
+ struct io_kiocb *cur;
- *link = io_prep_linked_timeout(req);
+ io_prep_async_work(req);
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(cur, &req->link_list, link_list)
+ io_prep_async_work(cur);
}
-static inline void io_queue_async_work(struct io_kiocb *req)
+static void __io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
-
- io_prep_async_work(req, &link);
+ struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
@@ -1107,14 +1192,22 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_queue_linked_timeout(link);
}
+static void io_queue_async_work(struct io_kiocb *req)
+{
+ /* init ->work of the whole link before punting */
+ io_prep_async_link(req);
+ __io_queue_async_work(req);
+}
+
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
- atomic_inc(&req->ctx->cq_timeouts);
- list_del_init(&req->list);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+ list_del_init(&req->timeout.list);
req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
@@ -1126,7 +1219,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
io_kill_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -1134,13 +1227,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
- struct io_kiocb *req = list_first_entry(&ctx->defer_list,
- struct io_kiocb, list);
+ struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+ struct io_defer_entry, list);
- if (req_need_defer(req))
+ if (req_need_defer(de->req, de->seq))
break;
- list_del_init(&req->list);
- io_queue_async_work(req);
+ list_del_init(&de->list);
+ /* punt-init is done before queueing for defer */
+ __io_queue_async_work(de->req);
+ kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1148,15 +1243,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->timeout_list)) {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, list);
+ struct io_kiocb, timeout.list);
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(req))
break;
if (req->timeout.target_seq != ctx->cached_cq_tail
- atomic_read(&ctx->cq_timeouts))
break;
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
io_kill_timeout(req);
}
}
@@ -1209,6 +1304,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ }
+}
+
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1239,13 +1343,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
break;
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- list);
- list_move(&req->list, &list);
+ compl.list);
+ list_move(&req->compl.list, &list);
req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->cflags);
+ WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1253,16 +1357,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
- if (cqe) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- }
+ io_cqring_mark_overflow(ctx);
+
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, list);
- list_del(&req->list);
+ req = list_first_entry(&list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
io_put_req(req);
}
@@ -1293,12 +1395,14 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
+ io_clean_op(req);
req->flags |= REQ_F_OVERFLOW;
- refcount_inc(&req->refs);
req->result = res;
- req->cflags = cflags;
- list_add_tail(&req->list, &ctx->cq_overflow_list);
+ req->compl.cflags = cflags;
+ refcount_inc(&req->refs);
+ list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
}
}
@@ -1307,7 +1411,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1320,9 +1424,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
io_cqring_ev_posted(ctx);
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+ struct io_ring_ctx *ctx = cs->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&cs->list)) {
+ struct io_kiocb *req;
+
+ req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
+ __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ if (!(req->flags & REQ_F_LINK_HEAD)) {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ } else {
+ spin_unlock_irq(&ctx->completion_lock);
+ io_put_req(req);
+ spin_lock_irq(&ctx->completion_lock);
+ }
+ }
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+ struct io_comp_state *cs)
+{
+ if (!cs) {
+ io_cqring_add_event(req, res, cflags);
+ io_put_req(req);
+ } else {
+ io_clean_op(req);
+ req->result = res;
+ req->compl.cflags = cflags;
+ list_add_tail(&req->compl.list, &cs->list);
+ if (++cs->nr >= 32)
+ io_submit_flush_completions(cs);
+ }
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
{
- __io_cqring_add_event(req, res, 0);
+ __io_req_complete(req, res, 0, NULL);
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1348,11 +1495,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
- if (!state) {
- req = kmem_cache_alloc(req_cachep, gfp);
- if (unlikely(!req))
- goto fallback;
- } else if (!state->free_reqs) {
+ if (!state->free_reqs) {
size_t sz;
int ret;
@@ -1390,23 +1533,15 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void __io_req_aux_free(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
- if (req->flags & REQ_F_NEED_CLEANUP)
- io_cleanup_req(req);
+ io_clean_op(req);
- kfree(req->io);
+ if (req->io)
+ kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->task)
- put_task_struct(req->task);
-
- io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- __io_req_aux_free(req);
+ io_req_clean_work(req);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1418,57 +1553,20 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
-
- percpu_ref_put(&req->ctx->refs);
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
-struct req_batch {
- void *reqs[IO_IOPOLL_BATCH];
- int to_free;
- int need_iter;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+static void __io_free_req(struct io_kiocb *req)
{
- if (!rb->to_free)
- return;
- if (rb->need_iter) {
- int i, inflight = 0;
- unsigned long flags;
-
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT)
- inflight++;
- __io_req_aux_free(req);
- }
- if (!inflight)
- goto do_free;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT) {
- list_del(&req->inflight_entry);
- if (!--inflight)
- break;
- }
- }
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ struct io_ring_ctx *ctx;
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- }
-do_free:
- kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- percpu_ref_put_many(&ctx->refs, rb->to_free);
- rb->to_free = rb->need_iter = 0;
+ io_dismantle_req(req);
+ __io_put_req_task(req);
+ ctx = req->ctx;
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+ percpu_ref_put(&ctx->refs);
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1488,53 +1586,67 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
return false;
}
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+ bool wake_ev;
+
+ if (list_empty(&req->link_list))
+ return false;
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ if (link->opcode != IORING_OP_LINK_TIMEOUT)
+ return false;
+
+ list_del_init(&link->link_list);
+ wake_ev = io_link_cancel_timeout(link);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- bool wake_ev = false;
+ bool wake_ev;
- /* Already got next link */
- if (req->flags & REQ_F_LINK_NEXT)
- return;
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ wake_ev = __io_kill_linked_timeout(req);
+ }
+
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *nxt = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
-
- if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
- (nxt->flags & REQ_F_TIMEOUT))) {
- list_del_init(&nxt->link_list);
- wake_ev |= io_link_cancel_timeout(nxt);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
- continue;
- }
-
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- *nxtptr = nxt;
- break;
- }
+ if (unlikely(list_empty(&req->link_list)))
+ return NULL;
- req->flags |= REQ_F_LINK_NEXT;
- if (wake_ev)
- io_cqring_ev_posted(ctx);
+ nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ list_del_init(&req->link_list);
+ if (!list_empty(&nxt->link_list))
+ nxt->flags |= REQ_F_LINK_HEAD;
+ return nxt;
}
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void io_fail_links(struct io_kiocb *req)
+static void __io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
@@ -1543,25 +1655,37 @@ static void io_fail_links(struct io_kiocb *req)
list_del_init(&link->link_list);
trace_io_uring_fail_link(req, link);
- if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->opcode == IORING_OP_LINK_TIMEOUT) {
- io_link_cancel_timeout(link);
- } else {
- io_cqring_fill_event(link, -ECANCELED);
- __io_double_put_req(link);
- }
+ io_cqring_fill_event(link, -ECANCELED);
+ __io_double_put_req(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
-static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_fail_links(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
- return;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ __io_fail_links(req);
+ }
+
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+{
+ req->flags &= ~REQ_F_LINK_HEAD;
+ if (req->flags & REQ_F_LINK_TIMEOUT)
+ io_kill_linked_timeout(req);
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -1569,62 +1693,187 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & REQ_F_FAIL_LINK) {
- io_fail_links(req);
- } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
- REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
+ if (likely(!(req->flags & REQ_F_FAIL_LINK)))
+ return io_req_link_next(req);
+ io_fail_links(req);
+ return NULL;
+}
- /*
- * If this is a timeout link, we could be racing with the
- * timeout timer. Grab the completion lock for this case to
- * protect against that.
- */
- spin_lock_irqsave(&ctx->completion_lock, flags);
- io_req_link_next(req, nxt);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ return NULL;
+ return __io_req_find_next(req);
+}
+
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_cqring_fill_event(req, error);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ __io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!__io_sq_thread_acquire_mm(ctx)) {
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL, NULL);
+ mutex_unlock(&ctx->uring_lock);
} else {
- io_req_link_next(req, nxt);
+ __io_req_task_cancel(req, -EFAULT);
}
}
-static void io_free_req(struct io_kiocb *req)
+static void io_req_task_submit(struct callback_head *cb)
{
- struct io_kiocb *nxt = NULL;
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- io_req_find_next(req, &nxt);
- __io_free_req(req);
+ __io_req_task_submit(req);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_req_task_submit);
+
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ init_task_work(&req->task_work, io_req_task_cancel);
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+}
+
+static void io_queue_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = io_req_find_next(req);
if (nxt)
- io_queue_async_work(nxt);
+ io_req_task_queue(nxt);
}
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+static void io_free_req(struct io_kiocb *req)
{
- struct io_kiocb *link;
- const struct io_op_def *def = &io_op_defs[nxt->opcode];
+ io_queue_next(req);
+ __io_free_req(req);
+}
- if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
- io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+struct req_batch {
+ void *reqs[IO_IOPOLL_BATCH];
+ int to_free;
- *workptr = &nxt->work;
- link = io_prep_linked_timeout(nxt);
- if (link)
- nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+ struct task_struct *task;
+ int task_refs;
+};
+
+static inline void io_init_req_batch(struct req_batch *rb)
+{
+ rb->to_free = 0;
+ rb->task_refs = 0;
+ rb->task = NULL;
+}
+
+static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
+ struct req_batch *rb)
+{
+ kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+ percpu_ref_put_many(&ctx->refs, rb->to_free);
+ rb->to_free = 0;
+}
+
+static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
+ struct req_batch *rb)
+{
+ if (rb->to_free)
+ __io_req_free_batch_flush(ctx, rb);
+ if (rb->task) {
+ put_task_struct_many(rb->task, rb->task_refs);
+ rb->task = NULL;
+ }
+}
+
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+{
+ if (unlikely(io_is_fallback_req(req))) {
+ io_free_req(req);
+ return;
+ }
+ if (req->flags & REQ_F_LINK_HEAD)
+ io_queue_next(req);
+
+ if (req->flags & REQ_F_TASK_PINNED) {
+ if (req->task != rb->task) {
+ if (rb->task)
+ put_task_struct_many(rb->task, rb->task_refs);
+ rb->task = req->task;
+ rb->task_refs = 0;
+ }
+ rb->task_refs++;
+ req->flags &= ~REQ_F_TASK_PINNED;
+ }
+
+ io_dismantle_req(req);
+ rb->reqs[rb->to_free++] = req;
+ if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+ __io_req_free_batch_flush(req->ctx, rb);
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
-__attribute__((nonnull))
-static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
+ struct io_kiocb *nxt = NULL;
+
if (refcount_dec_and_test(&req->refs)) {
- io_req_find_next(req, nxtptr);
+ nxt = io_req_find_next(req);
__io_free_req(req);
}
+ return nxt;
}
static void io_put_req(struct io_kiocb *req)
@@ -1633,24 +1882,20 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_steal_work(struct io_kiocb *req,
- struct io_wq_work **workptr)
+static struct io_wq_work *io_steal_work(struct io_kiocb *req)
{
+ struct io_kiocb *nxt;
+
/*
- * It's in an io-wq worker, so there always should be at least
- * one reference, which will be dropped in io_put_work() just
- * after the current handler returns.
- *
- * It also means, that if the counter dropped to 1, then there is
- * no asynchronous users left, so it's safe to steal the next work.
+ * A ref is owned by io-wq in which context we're. So, if that's the
+ * last one, it's safe to steal next work. False negatives are Ok,
+ * it just will be re-punted async in io_put_work()
*/
- if (refcount_read(&req->refs) == 1) {
- struct io_kiocb *nxt = NULL;
+ if (refcount_read(&req->refs) != 1)
+ return NULL;
- io_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
- }
+ nxt = io_req_find_next(req);
+ return nxt ? &nxt->work : NULL;
}
/*
@@ -1700,31 +1945,45 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
-static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
+static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
{
- if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
- return false;
+ unsigned int cflags;
- if (req->file || req->io)
- rb->need_iter++;
-
- rb->reqs[rb->to_free++] = req;
- if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
- io_free_req_many(req->ctx, rb);
- return true;
+ cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ cflags |= IORING_CQE_F_BUFFER;
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ kfree(kbuf);
+ return cflags;
}
-static int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
struct io_buffer *kbuf;
- int cflags;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
- req->rw.addr = 0;
- kfree(kbuf);
- return cflags;
+ return io_put_kbuf(req, kbuf);
+}
+
+static inline bool io_run_task_work(void)
+{
+ if (current->task_works) {
+ __set_current_state(TASK_RUNNING);
+ task_work_run();
+ return true;
+ }
+
+ return false;
+}
+
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, inflight_entry);
+ list_del(&req->inflight_entry);
+ __io_complete_rw(req, -EAGAIN, 0, NULL);
+ } while (!list_empty(again));
}
/*
@@ -1735,41 +1994,40 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
+ LIST_HEAD(again);
+
+ /* order with ->result store in io_complete_rw_iopoll() */
+ smp_rmb();
- rb.to_free = rb.need_iter = 0;
+ io_init_req_batch(&rb);
while (!list_empty(done)) {
int cflags = 0;
- req = list_first_entry(done, struct io_kiocb, list);
- list_del(&req->list);
+ req = list_first_entry(done, struct io_kiocb, inflight_entry);
+ if (READ_ONCE(req->result) == -EAGAIN) {
+ req->iopoll_completed = 0;
+ list_move_tail(&req->inflight_entry, &again);
+ continue;
+ }
+ list_del(&req->inflight_entry);
if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_kbuf(req);
+ cflags = io_put_rw_kbuf(req);
__io_cqring_fill_event(req, req->result, cflags);
(*nr_events)++;
- if (refcount_dec_and_test(&req->refs) &&
- !io_req_multi_free(&rb, req))
- io_free_req(req);
+ if (refcount_dec_and_test(&req->refs))
+ io_req_free_batch(&rb, req);
}
io_commit_cqring(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
- io_free_req_many(ctx, &rb);
-}
-
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
+ io_req_free_batch_finish(ctx, &rb);
- do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- } while (!list_empty(again));
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1777,7 +2035,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
- LIST_HEAD(again);
bool spin;
int ret;
@@ -1788,7 +2045,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
spin = !ctx->poll_multi_file && *nr_events < min;
ret = 0;
- list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+ list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
struct kiocb *kiocb = &req->rw.kiocb;
/*
@@ -1797,23 +2054,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* and complete those lists first, if we have entries there.
*/
if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->list, &done);
+ list_move_tail(&req->inflight_entry, &done);
continue;
}
if (!list_empty(&done))
break;
- if (req->result == -EAGAIN) {
- list_move_tail(&req->list, &again);
- continue;
- }
- if (!list_empty(&again))
- break;
-
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
+ /* iopoll may have completed current req */
+ if (READ_ONCE(req->iopoll_completed))
+ list_move_tail(&req->inflight_entry, &done);
+
if (ret && spin)
spin = false;
ret = 0;
@@ -1822,9 +2076,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
- if (!list_empty(&again))
- io_iopoll_queue(&again);
-
return ret;
}
@@ -1836,13 +2087,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
- while (!list_empty(&ctx->poll_list) && !need_resched()) {
+ while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
int ret;
ret = io_do_iopoll(ctx, nr_events, min);
if (ret < 0)
return ret;
- if (!min || *nr_events >= min)
+ if (*nr_events >= min)
return 0;
}
@@ -1853,29 +2104,37 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->poll_list)) {
+ while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_iopoll_getevents(ctx, &nr_events, 1);
+ io_do_iopoll(ctx, &nr_events, 0);
+ /* let it sleep and repeat later if can't complete a request */
+ if (nr_events == 0)
+ break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
* in this case we need to ensure that we reap all events.
+ * Also let task_work, etc. to progress by releasing the mutex
*/
- cond_resched();
+ if (need_resched()) {
+ mutex_unlock(&ctx->uring_lock);
+ cond_resched();
+ mutex_lock(&ctx->uring_lock);
+ }
}
mutex_unlock(&ctx->uring_lock);
}
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
+ unsigned int nr_events = 0;
int iters = 0, ret = 0;
/*
@@ -1885,8 +2144,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
*/
mutex_lock(&ctx->uring_lock);
do {
- int tmin = 0;
-
/*
* Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that
@@ -1907,17 +2164,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
*/
if (!(++iters & 7)) {
mutex_unlock(&ctx->uring_lock);
+ io_run_task_work();
mutex_lock(&ctx->uring_lock);
}
- if (*nr_events < min)
- tmin = min - *nr_events;
-
- ret = io_iopoll_getevents(ctx, nr_events, tmin);
+ ret = io_iopoll_getevents(ctx, &nr_events, min);
if (ret <= 0)
break;
ret = 0;
- } while (min && !*nr_events && !need_resched());
+ } while (min && !nr_events && !need_resched());
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -1937,13 +2192,8 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
-static inline void req_set_fail_links(struct io_kiocb *req)
-{
- if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
-}
-
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res,
+ struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
int cflags = 0;
@@ -1954,16 +2204,96 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
if (res != req->result)
req_set_fail_links(req);
if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_kbuf(req);
- __io_cqring_add_event(req, res, cflags);
+ cflags = io_put_rw_kbuf(req);
+ __io_req_complete(req, res, cflags, cs);
+}
+
+#ifdef CONFIG_BLOCK
+static bool io_resubmit_prep(struct io_kiocb *req, int error)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ ssize_t ret = -ECANCELED;
+ struct iov_iter iter;
+ int rw;
+
+ if (error) {
+ ret = error;
+ goto end_req;
+ }
+
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ rw = READ;
+ break;
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ rw = WRITE;
+ break;
+ default:
+ printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
+ req->opcode);
+ goto end_req;
+ }
+
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ goto end_req;
+ ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+ if (!ret)
+ return true;
+ kfree(iovec);
+end_req:
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return false;
+}
+
+static void io_rw_resubmit(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
+ int err;
+
+ err = io_sq_thread_acquire_mm(ctx, req);
+
+ if (io_resubmit_prep(req, err)) {
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ }
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req, long res)
+{
+#ifdef CONFIG_BLOCK
+ int ret;
+
+ if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+ return false;
+
+ init_task_work(&req->task_work, io_rw_resubmit);
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (!ret)
+ return true;
+#endif
+ return false;
+}
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs)
+{
+ if (!io_rw_reissue(req, res))
+ io_complete_rw_common(&req->rw.kiocb, res, cs);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- io_complete_rw_common(kiocb, res);
- io_put_req(req);
+ __io_complete_rw(req, res, res2, NULL);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -1973,11 +2303,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result)
+ if (res != -EAGAIN && res != req->result)
req_set_fail_links(req);
- req->result = res;
- if (res != -EAGAIN)
- WRITE_ONCE(req->iopoll_completed, 1);
+
+ WRITE_ONCE(req->result, res);
+ /* order with io_poll_complete() checking ->result */
+ smp_wmb();
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
@@ -1995,13 +2327,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->poll_list)) {
+ if (list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_file = false;
} else if (!ctx->poll_multi_file) {
struct io_kiocb *list_req;
- list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
- list);
+ list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
+ inflight_entry);
if (list_req->file != req->file)
ctx->poll_multi_file = true;
}
@@ -2011,9 +2343,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->list, &ctx->poll_list);
+ list_add(&req->inflight_entry, &ctx->iopoll_list);
else
- list_add_tail(&req->list, &ctx->poll_list);
+ list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sqo_wait))
@@ -2022,10 +2354,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
static void __io_state_file_put(struct io_submit_state *state)
{
- int diff = state->has_refs - state->used_refs;
-
- if (diff)
- fput_many(state->file, diff);
+ if (state->has_refs)
+ fput_many(state->file, state->has_refs);
state->file = NULL;
}
@@ -2047,7 +2377,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (state->file) {
if (state->fd == fd) {
- state->used_refs++;
+ state->has_refs--;
state->ios_left--;
return state->file;
}
@@ -2058,12 +2388,20 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left;
- state->used_refs = 1;
state->ios_left--;
+ state->has_refs = state->ios_left;
return state->file;
}
+static bool io_bdev_nowait(struct block_device *bdev)
+{
+#ifdef CONFIG_BLOCK
+ return !bdev || queue_is_mq(bdev_get_queue(bdev));
+#else
+ return true;
+#endif
+}
+
/*
* If we tracked the file through the SCM inflight mechanism, we could support
* any file. For now, just ensure that anything potentially problematic is done
@@ -2073,10 +2411,19 @@ static bool io_file_supports_async(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
- if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
- return true;
- if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+ if (S_ISBLK(mode)) {
+ if (io_bdev_nowait(file->f_inode->i_bdev))
+ return true;
+ return false;
+ }
+ if (S_ISCHR(mode) || S_ISSOCK(mode))
return true;
+ if (S_ISREG(mode)) {
+ if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+ file->f_op != &io_uring_fops)
+ return true;
+ return false;
+ }
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
@@ -2127,6 +2474,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ io_get_req_task(req);
+
if (force_nonblock)
kiocb->ki_flags |= IOCB_NOWAIT;
@@ -2137,8 +2487,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
- req->result = 0;
req->iopoll_completed = 0;
+ io_get_req_task(req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2172,14 +2522,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+ struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
- io_complete_rw(kiocb, ret, 0);
+ __io_complete_rw(req, ret, 0, cs);
else
io_rw_done(kiocb, ret);
}
@@ -2435,10 +2786,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
- *iovec = iorw->iov;
- iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
- if (iorw->iov == iorw->fast_iov)
- *iovec = NULL;
+ iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
+ *iovec = NULL;
return iorw->size;
}
@@ -2523,15 +2872,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
- req->io->rw.nr_segs = iter->nr_segs;
- req->io->rw.size = io_size;
- req->io->rw.iov = iovec;
- if (!req->io->rw.iov) {
- req->io->rw.iov = req->io->rw.fast_iov;
- if (req->io->rw.iov != fast_iov)
- memcpy(req->io->rw.iov, fast_iov,
+ struct io_async_rw *rw = &req->io->rw;
+
+ rw->nr_segs = iter->nr_segs;
+ rw->size = io_size;
+ if (!iovec) {
+ rw->iov = rw->fast_iov;
+ if (rw->iov != fast_iov)
+ memcpy(rw->iov, fast_iov,
sizeof(struct iovec) * iter->nr_segs);
} else {
+ rw->iov = iovec;
req->flags |= REQ_F_NEED_CLEANUP;
}
}
@@ -2565,11 +2916,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
return 0;
}
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
+ bool force_nonblock)
+{
+ struct io_async_ctx *io = req->io;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ io->rw.iov = io->rw.fast_iov;
+ req->io = NULL;
+ ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
+ req->io = io;
+ if (unlikely(ret < 0))
+ return ret;
+
+ io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ return 0;
+}
+
static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
bool force_nonblock)
{
- struct io_async_ctx *io;
- struct iov_iter iter;
ssize_t ret;
ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2582,84 +2949,176 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ return io_rw_prep_async(req, READ, force_nonblock);
+}
+
+static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *arg)
+{
+ struct wait_page_queue *wpq;
+ struct io_kiocb *req = wait->private;
+ struct wait_page_key *key = arg;
+ int ret;
+
+ wpq = container_of(wait, struct wait_page_queue, wait);
+
+ if (!wake_page_match(wpq, key))
+ return 0;
- io = req->io;
- io->rw.iov = io->rw.fast_iov;
- req->io = NULL;
- ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
- if (ret < 0)
- return ret;
+ /* Stop waking things up if the page is locked again */
+ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
- return 0;
+ list_del_init(&wait->entry);
+
+ init_task_work(&req->task_work, io_req_task_submit);
+ /* submit ref gets dropped, acquire a new one */
+ refcount_inc(&req->refs);
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ /* queue just for cancelation */
+ init_task_work(&req->task_work, io_req_task_cancel);
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+ return 1;
+}
+
+static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
+ struct wait_page_queue *wait,
+ wait_queue_func_t func,
+ void *data)
+{
+ /* Can't support async wakeup with polled IO */
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
+ if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
+ wait->wait.func = func;
+ wait->wait.private = data;
+ wait->wait.flags = 0;
+ INIT_LIST_HEAD(&wait->wait.entry);
+ kiocb->ki_flags |= IOCB_WAITQ;
+ kiocb->ki_waitq = wait;
+ return 0;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+
+static bool io_rw_should_retry(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ int ret;
+
+ /* never retry for NOWAIT, we just complete with -EAGAIN */
+ if (req->flags & REQ_F_NOWAIT)
+ return false;
+
+ /* already tried, or we're doing O_DIRECT */
+ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+ return false;
+ /*
+ * just use poll if we can, and don't attempt if the fs doesn't
+ * support callback based unlocks
+ */
+ if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+ return false;
+
+ /*
+ * If request type doesn't require req->io to defer in general,
+ * we need to allocate it here
+ */
+ if (!req->io && __io_alloc_async_ctx(req))
+ return false;
+
+ ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
+ io_async_buf_func, req);
+ if (!ret) {
+ io_get_req_task(req);
+ return true;
+ }
+
+ return false;
+}
+
+static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+{
+ if (req->file->f_op->read_iter)
+ return call_read_iter(req->file, &req->rw.kiocb, iter);
+ return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
}
-static int io_read(struct io_kiocb *req, bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
size_t iov_count;
- ssize_t io_size, ret;
+ ssize_t io_size, ret, ret2;
+ unsigned long nr_segs;
ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
+ io_size = ret;
+ req->result = io_size;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
kiocb->ki_flags &= ~IOCB_NOWAIT;
- req->result = 0;
- io_size = ret;
- if (req->flags & REQ_F_LINK_HEAD)
- req->result = io_size;
-
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
+ /* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
iov_count = iov_iter_count(&iter);
+ nr_segs = iter.nr_segs;
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
- if (!ret) {
- ssize_t ret2;
+ if (unlikely(ret))
+ goto out_free;
- if (req->file->f_op->read_iter)
- ret2 = call_read_iter(req->file, kiocb, &iter);
- else
- ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+ ret2 = io_iter_do_read(req, &iter);
- /* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2);
- } else {
+ /* Catch -EAGAIN return for forced non-blocking submission */
+ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
+ kiocb_done(kiocb, ret2, cs);
+ } else {
+ iter.count = iov_count;
+ iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec,
- inline_vecs, &iter);
- if (ret)
+ ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+ &iter);
+ if (ret)
+ goto out_free;
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ /* if we can retry, do so with the callbacks armed */
+ if (io_rw_should_retry(req)) {
+ ret2 = io_iter_do_read(req, &iter);
+ if (ret2 == -EIOCBQUEUED) {
goto out_free;
- /* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT) &&
- !file_can_poll(req->file))
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
+ } else if (ret2 != -EAGAIN) {
+ kiocb_done(kiocb, ret2, cs);
+ goto out_free;
+ }
}
+ kiocb->ki_flags &= ~IOCB_WAITQ;
+ return -EAGAIN;
}
out_free:
- kfree(iovec);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (iovec)
+ kfree(iovec);
return ret;
}
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
bool force_nonblock)
{
- struct io_async_ctx *io;
- struct iov_iter iter;
ssize_t ret;
ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2669,49 +3128,33 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- req->fsize = rlimit(RLIMIT_FSIZE);
-
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
-
- io = req->io;
- io->rw.iov = io->rw.fast_iov;
- req->io = NULL;
- ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
- if (ret < 0)
- return ret;
-
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
- return 0;
+ return io_rw_prep_async(req, WRITE, force_nonblock);
}
-static int io_write(struct io_kiocb *req, bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
size_t iov_count;
- ssize_t ret, io_size;
+ ssize_t ret, ret2, io_size;
+ unsigned long nr_segs;
ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
+ io_size = ret;
+ req->result = io_size;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
- req->result = 0;
- io_size = ret;
- if (req->flags & REQ_F_LINK_HEAD)
- req->result = io_size;
-
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
+ /* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, WRITE))
goto copy_iov;
@@ -2721,60 +3164,54 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
goto copy_iov;
iov_count = iov_iter_count(&iter);
+ nr_segs = iter.nr_segs;
ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
- if (!ret) {
- ssize_t ret2;
-
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (req->flags & REQ_F_ISREG) {
- __sb_start_write(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE, true);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
-
- if (!force_nonblock)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ if (unlikely(ret))
+ goto out_free;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, &iter);
- else
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ /*
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * io_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
+ */
+ if (req->flags & REQ_F_ISREG) {
+ __sb_start_write(file_inode(req->file)->i_sb,
+ SB_FREEZE_WRITE, true);
+ __sb_writers_release(file_inode(req->file)->i_sb,
+ SB_FREEZE_WRITE);
+ }
+ kiocb->ki_flags |= IOCB_WRITE;
- if (!force_nonblock)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (req->file->f_op->write_iter)
+ ret2 = call_write_iter(req->file, kiocb, &iter);
+ else
+ ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
- /*
- * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
- * retry them without IOCB_NOWAIT.
- */
- if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
- ret2 = -EAGAIN;
- if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2);
- } else {
+ /*
+ * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+ * retry them without IOCB_NOWAIT.
+ */
+ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+ ret2 = -EAGAIN;
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ kiocb_done(kiocb, ret2, cs);
+ } else {
+ iter.count = iov_count;
+ iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec,
- inline_vecs, &iter);
- if (ret)
- goto out_free;
- /* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT) &&
- !file_can_poll(req->file))
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
- }
+ ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+ &iter);
+ if (ret)
+ goto out_free;
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ return -EAGAIN;
}
out_free:
- req->flags &= ~REQ_F_NEED_CLEANUP;
- kfree(iovec);
+ if (iovec)
+ kfree(iovec);
return ret;
}
@@ -2839,10 +3276,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock)
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret != sp->len)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -2876,25 +3312,23 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret != sp->len)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
-static int io_nop(struct io_kiocb *req)
+static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- io_cqring_add_event(req, 0);
- io_put_req(req);
+ __io_req_complete(req, 0, 0, cs);
return 0;
}
@@ -2933,8 +3367,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -2949,7 +3382,6 @@ static int io_fallocate_prep(struct io_kiocb *req,
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
- req->fsize = rlimit(RLIMIT_FSIZE);
return 0;
}
@@ -2960,15 +3392,11 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
/* fallocate always requiring blocking context */
if (force_nonblock)
return -EAGAIN;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3064,8 +3492,7 @@ err:
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3119,7 +3546,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return i;
}
-static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
@@ -3138,8 +3566,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3197,7 +3624,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
return i ? i : -ENOMEM;
}
-static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
@@ -3226,8 +3654,7 @@ out:
io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3258,7 +3685,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
-static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
@@ -3270,8 +3698,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
#else
return -EOPNOTSUPP;
@@ -3307,8 +3734,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
ret = do_madvise(ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
#else
return -EOPNOTSUPP;
@@ -3347,8 +3773,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3387,8 +3812,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3419,7 +3843,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_close(struct io_kiocb *req, bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_close *close = &req->close;
int ret;
@@ -3433,8 +3858,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
/* if the file has a flush method, be safe and punt to async */
if (close->put_file->f_op->flush && force_nonblock) {
+ /* was never set, but play safe */
+ req->flags &= ~REQ_F_NOWAIT;
/* avoid grabbing files - we don't need the files */
- req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
+ req->flags |= REQ_F_NO_FILE_TABLE;
return -EAGAIN;
}
@@ -3442,10 +3869,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
ret = filp_close(close->put_file, req->work.files);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
fput(close->put_file);
close->put_file = NULL;
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3479,8 +3905,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3500,6 +3925,15 @@ static int io_setup_async_msg(struct io_kiocb *req,
return -EAGAIN;
}
+static int io_sendmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
+{
+ iomsg->iov = iomsg->fast_iov;
+ iomsg->msg.msg_name = &iomsg->addr;
+ return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
+ req->sr_msg.msg_flags, &iomsg->iov);
+}
+
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
@@ -3510,7 +3944,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
- sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
#ifdef CONFIG_COMPAT
@@ -3524,135 +3958,126 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
- io->msg.iov = io->msg.fast_iov;
- ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
- &io->msg.iov);
+ ret = io_sendmsg_copy_hdr(req, &io->msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_async_msghdr *kmsg = NULL;
+ struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
+ unsigned flags;
int ret;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_async_ctx io;
- unsigned flags;
-
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
- struct io_sr_msg *sr = &req->sr_msg;
-
- kmsg = &io.msg;
- kmsg->msg.msg_name = &io.msg.addr;
+ if (unlikely(!sock))
+ return ret;
- io.msg.iov = io.msg.fast_iov;
- ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
- sr->msg_flags, &io.msg.iov);
- if (ret)
- return ret;
- }
+ if (req->io) {
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &req->io->msg.addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
+ } else {
+ ret = io_sendmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- if (kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ struct iovec iov;
struct socket *sock;
+ unsigned flags;
int ret;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- struct iovec iov;
- unsigned flags;
+ if (unlikely(!sock))
+ return ret;
- ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
- &msg.msg_iter);
- if (ret)
- return ret;
+ ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ return ret;;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &msg);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
struct io_sr_msg *sr = &req->sr_msg;
struct iovec __user *uiov;
size_t iov_len;
int ret;
- ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
- &uiov, &iov_len);
+ ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
+ &iomsg->uaddr, &uiov, &iov_len);
if (ret)
return ret;
if (req->flags & REQ_F_BUFFER_SELECT) {
if (iov_len > 1)
return -EINVAL;
- if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+ if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
return -EFAULT;
- sr->len = io->msg.iov[0].iov_len;
- iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+ sr->len = iomsg->iov[0].iov_len;
+ iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
sr->len);
- io->msg.iov = NULL;
+ iomsg->iov = NULL;
} else {
ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &io->msg.iov, &io->msg.msg.msg_iter);
+ &iomsg->iov, &iomsg->msg.msg_iter);
if (ret > 0)
ret = 0;
}
@@ -3662,7 +4087,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
#ifdef CONFIG_COMPAT
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_ctx *io)
+ struct io_async_msghdr *iomsg)
{
struct compat_msghdr __user *msg_compat;
struct io_sr_msg *sr = &req->sr_msg;
@@ -3671,8 +4096,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
compat_size_t len;
int ret;
- msg_compat = (struct compat_msghdr __user *) sr->msg;
- ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+ msg_compat = (struct compat_msghdr __user *) sr->umsg;
+ ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
&ptr, &len);
if (ret)
return ret;
@@ -3689,12 +4114,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
if (clen < 0)
return -EINVAL;
- sr->len = io->msg.iov[0].iov_len;
- io->msg.iov = NULL;
+ sr->len = iomsg->iov[0].iov_len;
+ iomsg->iov = NULL;
} else {
ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
- &io->msg.iov,
- &io->msg.msg.msg_iter);
+ &iomsg->iov,
+ &iomsg->msg.msg_iter);
if (ret < 0)
return ret;
}
@@ -3703,39 +4128,40 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
}
#endif
-static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
- io->msg.iov = io->msg.fast_iov;
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->iov = iomsg->fast_iov;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, io);
+ return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif
- return __io_recvmsg_copy_hdr(req, io);
+ return __io_recvmsg_copy_hdr(req, iomsg);
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- int *cflags, bool needs_lock)
+ bool needs_lock)
{
struct io_sr_msg *sr = &req->sr_msg;
struct io_buffer *kbuf;
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return NULL;
-
kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
sr->kbuf = kbuf;
req->flags |= REQ_F_BUFFER_SELECTED;
-
- *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- *cflags |= IORING_CQE_F_BUFFER;
return kbuf;
}
+static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
+{
+ return io_put_kbuf(req, req->sr_msg.kbuf);
+}
+
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -3747,7 +4173,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
- sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
@@ -3762,127 +4188,123 @@ static int io_recvmsg_prep(struct io_kiocb *req,
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
- ret = io_recvmsg_copy_hdr(req, io);
+ ret = io_recvmsg_copy_hdr(req, &io->msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_async_msghdr *kmsg = NULL;
+ struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
+ struct io_buffer *kbuf;
+ unsigned flags;
int ret, cflags = 0;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_buffer *kbuf;
- struct io_async_ctx io;
- unsigned flags;
-
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
- kmsg = &io.msg;
- kmsg->msg.msg_name = &io.msg.addr;
+ if (unlikely(!sock))
+ return ret;
- ret = io_recvmsg_copy_hdr(req, &io);
- if (ret)
- return ret;
- }
+ if (req->io) {
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &req->io->msg.addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
+ } else {
+ ret = io_recvmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
- kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
- if (IS_ERR(kbuf)) {
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ kbuf = io_recv_buffer_select(req, !force_nonblock);
+ if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- } else if (kbuf) {
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
- 1, req->sr_msg.len);
- }
+ kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+ 1, req->sr_msg.len);
+ }
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
- kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
+ kmsg->uaddr, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- if (kmsg && kmsg->iov != kmsg->fast_iov)
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_recv_kbuf(req);
+ if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, cflags, cs);
return 0;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_buffer *kbuf = NULL;
+ struct io_buffer *kbuf;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ void __user *buf = sr->buf;
struct socket *sock;
+ struct iovec iov;
+ unsigned flags;
int ret, cflags = 0;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_sr_msg *sr = &req->sr_msg;
- void __user *buf = sr->buf;
- struct msghdr msg;
- struct iovec iov;
- unsigned flags;
+ if (unlikely(!sock))
+ return ret;
- kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ kbuf = io_recv_buffer_select(req, !force_nonblock);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- else if (kbuf)
- buf = u64_to_user_ptr(kbuf->addr);
+ buf = u64_to_user_ptr(kbuf->addr);
+ }
- ret = import_single_range(READ, buf, sr->len, &iov,
- &msg.msg_iter);
- if (ret) {
- kfree(kbuf);
- return ret;
- }
+ ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ goto out_free;
- req->flags |= REQ_F_NEED_CLEANUP;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
- msg.msg_iocb = NULL;
- msg.msg_flags = 0;
-
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
-
- ret = sock_recvmsg(sock, &msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
+ msg.msg_flags = 0;
- kfree(kbuf);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_cqring_add_event(req, ret, cflags);
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = sock_recvmsg(sock, &msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+out_free:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_recv_kbuf(req);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, cflags, cs);
return 0;
}
@@ -3902,7 +4324,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_accept *accept = &req->accept;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -3921,8 +4344,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
ret = -EINTR;
req_set_fail_links(req);
}
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3946,7 +4368,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
&io->connect.address);
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_async_ctx __io, *io;
unsigned file_flags;
@@ -3982,8 +4405,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock)
out:
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
#else /* !CONFIG_NET */
@@ -3992,12 +4414,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4008,12 +4432,14 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return -EOPNOTSUPP;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4023,7 +4449,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4033,7 +4460,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4048,7 +4476,6 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- struct task_struct *tsk;
int ret;
/* for instances that support it check for an event match first: */
@@ -4059,7 +4486,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
list_del_init(&poll->wait.entry);
- tsk = req->task;
req->result = mask;
init_task_work(&req->task_work, func);
/*
@@ -4068,13 +4494,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = task_work_add(tsk, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
+ struct task_struct *tsk;
+
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, true);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
}
- wake_up_process(tsk);
return 1;
}
@@ -4098,9 +4526,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req)
+static void io_poll_remove_double(struct io_kiocb *req, void *data)
{
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = data;
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4120,7 +4548,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4138,7 +4566,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
- io_put_req_find_next(req, nxt);
+ *nxt = io_put_req_find_next(req);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -4150,34 +4578,29 @@ static void io_poll_task_func(struct callback_head *cb)
struct io_kiocb *nxt = NULL;
io_poll_task_handler(req, &nxt);
- if (nxt) {
- struct io_ring_ctx *ctx = nxt->ctx;
-
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(nxt, NULL);
- mutex_unlock(&ctx->uring_lock);
- }
+ if (nxt)
+ __io_req_task_submit(nxt);
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = req->apoll->double_poll;
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
- if (req->poll.head) {
+ if (poll && poll->head) {
bool done;
- spin_lock(&req->poll.head->lock);
- done = list_empty(&req->poll.wait.entry);
+ spin_lock(&poll->head->lock);
+ done = list_empty(&poll->wait.entry);
if (!done)
- list_del_init(&req->poll.wait.entry);
- spin_unlock(&req->poll.head->lock);
+ list_del_init(&poll->wait.entry);
+ spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
}
@@ -4197,7 +4620,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
}
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
+ struct wait_queue_head *head,
+ struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
@@ -4208,7 +4632,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (unlikely(poll->head)) {
/* already have a 2nd entry, fail a third attempt */
- if (req->io) {
+ if (*poll_ptr) {
pt->error = -EINVAL;
return;
}
@@ -4220,20 +4644,25 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
- req->io = (void *) poll;
+ *poll_ptr = poll;
}
pt->error = 0;
poll->head = head;
- add_wait_queue(head, &poll->wait);
+
+ if (poll->events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(head, &poll->wait);
+ else
+ add_wait_queue(head, &poll->wait);
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
static void io_async_task_func(struct callback_head *cb)
@@ -4241,7 +4670,6 @@ static void io_async_task_func(struct callback_head *cb)
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- bool canceled = false;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
@@ -4251,33 +4679,19 @@ static void io_async_task_func(struct callback_head *cb)
}
/* If req is still hashed, it cannot have been canceled. Don't check. */
- if (hash_hashed(&req->hash_node)) {
+ if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- } else {
- canceled = READ_ONCE(apoll->poll.canceled);
- if (canceled) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
- }
- }
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
- /* restore ->work in case we need to retry again */
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work, sizeof(req->work));
- kfree(apoll);
+ if (!READ_ONCE(apoll->poll.canceled))
+ __io_req_task_submit(req);
+ else
+ __io_req_task_cancel(req, -ECANCELED);
- if (!canceled) {
- __set_current_state(TASK_RUNNING);
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL);
- mutex_unlock(&ctx->uring_lock);
- } else {
- io_cqring_ev_posted(ctx);
- req_set_fail_links(req);
- io_double_put_req(req);
- }
+ kfree(apoll->double_poll);
+ kfree(apoll);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -4310,8 +4724,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
- poll->file = req->file;
io_init_poll_iocb(poll, mask, wake_func);
+ poll->file = req->file;
poll->wait.private = req;
ipt->pt._key = mask;
@@ -4348,11 +4762,10 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
- bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
- if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+ if (req->flags & REQ_F_POLLED)
return false;
if (!def->pollin && !def->pollout)
return false;
@@ -4360,14 +4773,10 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return false;
+ apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&apoll->work, &req->work, sizeof(req->work));
- had_io = req->io != NULL;
-
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4383,13 +4792,9 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret) {
- ipt.error = 0;
- /* only remove double add if we did it here */
- if (!had_io)
- io_poll_remove_double(req);
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
return false;
}
@@ -4420,23 +4825,18 @@ static bool io_poll_remove_one(struct io_kiocb *req)
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
+ io_poll_remove_double(req, apoll->double_poll);
+
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
io_put_req(req);
- /*
- * restore ->work because we will call
- * io_req_work_drop_env below when dropping the
- * final reference.
- */
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work,
- sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
}
}
@@ -4516,10 +4916,9 @@ static int io_poll_remove(struct io_kiocb *req)
ret = io_poll_cancel(ctx, addr);
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -4537,13 +4936,13 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- u16 events;
+ u32 events;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4552,11 +4951,14 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (!poll->file)
return -EBADF;
- events = READ_ONCE(sqe->poll_events);
- poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+ events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+ events = swahw32(events);
+#endif
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
+ (events & EPOLLEXCLUSIVE);
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
return 0;
}
@@ -4568,7 +4970,6 @@ static int io_poll_add(struct io_kiocb *req)
__poll_t mask;
INIT_HLIST_NODE(&req->hash_node);
- INIT_LIST_HEAD(&req->list);
ipt.pt._qproc = io_poll_queue_proc;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
@@ -4595,15 +4996,16 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- atomic_inc(&ctx->cq_timeouts);
-
spin_lock_irqsave(&ctx->completion_lock, flags);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+
/*
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
*/
- if (!list_empty(&req->list))
- list_del_init(&req->list);
+ if (!list_empty(&req->timeout.list))
+ list_del_init(&req->timeout.list);
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
@@ -4620,9 +5022,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
struct io_kiocb *req;
int ret = -ENOENT;
- list_for_each_entry(req, &ctx->timeout_list, list) {
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
if (user_data == req->user_data) {
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
ret = 0;
break;
}
@@ -4646,7 +5048,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
req->timeout.addr = READ_ONCE(sqe->addr);
@@ -4702,7 +5106,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data = &req->io->timeout;
data->req = req;
- req->flags |= REQ_F_TIMEOUT;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
@@ -4730,8 +5133,7 @@ static int io_timeout(struct io_kiocb *req)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- if (!off) {
- req->flags |= REQ_F_TIMEOUT_NOSEQ;
+ if (io_is_timeout_noseq(req)) {
entry = ctx->timeout_list.prev;
goto add;
}
@@ -4744,16 +5146,17 @@ static int io_timeout(struct io_kiocb *req)
* the one we need first.
*/
list_for_each_prev(entry, &ctx->timeout_list) {
- struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+ struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
+ timeout.list);
- if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(nxt))
continue;
/* nxt.seq is behind @tail, otherwise would've been completed */
if (off >= nxt->timeout.target_seq - tail)
break;
}
add:
- list_add(&req->list, entry);
+ list_add(&req->timeout.list, entry);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->completion_lock);
@@ -4772,7 +5175,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -4824,8 +5227,9 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
- sqe->cancel_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -4843,7 +5247,9 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
@@ -4854,7 +5260,8 @@ static int io_files_update_prep(struct io_kiocb *req,
return 0;
}
-static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_files_update up;
@@ -4872,8 +5279,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -4885,15 +5291,11 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (!sqe)
return 0;
- io_req_init_async(req);
-
- if (io_op_defs[req->opcode].file_table) {
- ret = io_grab_files(req);
- if (unlikely(ret))
- return ret;
- }
-
- io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+ ret = io_prep_work_files(req);
+ if (unlikely(ret))
+ return ret;
switch (req->opcode) {
case IORING_OP_NOP:
@@ -4995,86 +5397,117 @@ static int io_req_defer_prep(struct io_kiocb *req,
return ret;
}
+static u32 io_get_sequence(struct io_kiocb *req)
+{
+ struct io_kiocb *pos;
+ struct io_ring_ctx *ctx = req->ctx;
+ u32 total_submitted, nr_reqs = 1;
+
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(pos, &req->link_list, link_list)
+ nr_reqs++;
+
+ total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
+ return total_submitted - nr_reqs;
+}
+
static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_defer_entry *de;
int ret;
+ u32 seq;
/* Still need defer if there is pending req in defer list. */
- if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
+ if (likely(list_empty_careful(&ctx->defer_list) &&
+ !(req->flags & REQ_F_IO_DRAIN)))
+ return 0;
+
+ seq = io_get_sequence(req);
+ /* Still a chance to pass the sequence check */
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
if (!req->io) {
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
- if (ret < 0)
+ if (ret)
return ret;
}
+ io_prep_async_link(req);
+ de = kmalloc(sizeof(*de), GFP_KERNEL);
+ if (!de)
+ return -ENOMEM;
spin_lock_irq(&ctx->completion_lock);
- if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
+ if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
- return 0;
+ kfree(de);
+ io_queue_async_work(req);
+ return -EIOCBQUEUED;
}
trace_io_uring_defer(ctx, req, req->user_data);
- list_add_tail(&req->list, &ctx->defer_list);
+ de->req = req;
+ de->seq = seq;
+ list_add_tail(&de->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static void io_cleanup_req(struct io_kiocb *req)
+static void __io_clean_op(struct io_kiocb *req)
{
struct io_async_ctx *io = req->io;
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
kfree((void *)(unsigned long)req->rw.addr);
- /* fallthrough */
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- if (io->rw.iov != io->rw.fast_iov)
- kfree(io->rw.iov);
- break;
- case IORING_OP_RECVMSG:
- if (req->flags & REQ_F_BUFFER_SELECTED)
- kfree(req->sr_msg.kbuf);
- /* fallthrough */
- case IORING_OP_SENDMSG:
- if (io->msg.iov != io->msg.fast_iov)
- kfree(io->msg.iov);
- break;
- case IORING_OP_RECV:
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ break;
+ case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
kfree(req->sr_msg.kbuf);
- break;
- case IORING_OP_OPENAT:
- case IORING_OP_OPENAT2:
- break;
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- io_put_file(req, req->splice.file_in,
- (req->splice.flags & SPLICE_F_FD_IN_FIXED));
- break;
+ break;
+ }
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
+
+ if (req->flags & REQ_F_NEED_CLEANUP) {
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ if (io->rw.iov != io->rw.fast_iov)
+ kfree(io->rw.iov);
+ break;
+ case IORING_OP_RECVMSG:
+ case IORING_OP_SENDMSG:
+ if (io->msg.iov != io->msg.fast_iov)
+ kfree(io->msg.iov);
+ break;
+ case IORING_OP_SPLICE:
+ case IORING_OP_TEE:
+ io_put_file(req, req->splice.file_in,
+ (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+ break;
+ }
+ req->flags &= ~REQ_F_NEED_CLEANUP;
}
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
}
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ bool force_nonblock, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
switch (req->opcode) {
case IORING_OP_NOP:
- ret = io_nop(req);
+ ret = io_nop(req, cs);
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
@@ -5084,7 +5517,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_read(req, force_nonblock);
+ ret = io_read(req, force_nonblock, cs);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
@@ -5094,7 +5527,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_write(req, force_nonblock);
+ ret = io_write(req, force_nonblock, cs);
break;
case IORING_OP_FSYNC:
if (sqe) {
@@ -5136,9 +5569,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, force_nonblock);
+ ret = io_sendmsg(req, force_nonblock, cs);
else
- ret = io_send(req, force_nonblock);
+ ret = io_send(req, force_nonblock, cs);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
@@ -5148,9 +5581,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, force_nonblock);
+ ret = io_recvmsg(req, force_nonblock, cs);
else
- ret = io_recv(req, force_nonblock);
+ ret = io_recv(req, force_nonblock, cs);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
@@ -5174,7 +5607,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_accept(req, force_nonblock);
+ ret = io_accept(req, force_nonblock, cs);
break;
case IORING_OP_CONNECT:
if (sqe) {
@@ -5182,7 +5615,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_connect(req, force_nonblock);
+ ret = io_connect(req, force_nonblock, cs);
break;
case IORING_OP_ASYNC_CANCEL:
if (sqe) {
@@ -5214,7 +5647,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_close(req, force_nonblock);
+ ret = io_close(req, force_nonblock, cs);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
@@ -5222,7 +5655,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_files_update(req, force_nonblock);
+ ret = io_files_update(req, force_nonblock, cs);
break;
case IORING_OP_STATX:
if (sqe) {
@@ -5262,7 +5695,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_epoll_ctl(req, force_nonblock);
+ ret = io_epoll_ctl(req, force_nonblock, cs);
break;
case IORING_OP_SPLICE:
if (sqe) {
@@ -5278,7 +5711,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_provide_buffers(req, force_nonblock);
+ ret = io_provide_buffers(req, force_nonblock, cs);
break;
case IORING_OP_REMOVE_BUFFERS:
if (sqe) {
@@ -5286,7 +5719,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_remove_buffers(req, force_nonblock);
+ ret = io_remove_buffers(req, force_nonblock, cs);
break;
case IORING_OP_TEE:
if (sqe) {
@@ -5308,9 +5741,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
- if (req->result == -EAGAIN)
- return -EAGAIN;
-
/* workqueue context doesn't hold uring_lock, grab it now */
if (in_async)
mutex_lock(&ctx->uring_lock);
@@ -5324,25 +5754,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static void io_arm_async_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- /* link head's timeout is queued in io_queue_async_work() */
- if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
- return;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
-}
-
-static void io_wq_submit_work(struct io_wq_work **workptr)
+static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
{
- struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_kiocb *timeout;
int ret = 0;
- io_arm_async_linked_timeout(req);
+ timeout = io_prep_linked_timeout(req);
+ if (timeout)
+ io_queue_linked_timeout(timeout);
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
@@ -5352,7 +5772,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
if (!ret) {
do {
- ret = io_issue_sqe(req, NULL, false);
+ ret = io_issue_sqe(req, NULL, false, NULL);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -5366,11 +5786,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
if (ret) {
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
}
- io_steal_work(req, workptr);
+ return io_steal_work(req);
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -5427,6 +5846,8 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
+ io_req_init_async(req);
+
if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
return 0;
if (!ctx->ring_file)
@@ -5452,6 +5873,13 @@ static int io_grab_files(struct io_kiocb *req)
return ret;
}
+static inline int io_prep_work_files(struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].file_table)
+ return 0;
+ return io_grab_files(req);
+}
+
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
@@ -5484,8 +5912,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
io_put_req(prev);
} else {
- io_cqring_add_event(req, -ETIME);
- io_put_req(req);
+ io_req_complete(req, -ETIME);
}
return HRTIMER_NORESTART;
}
@@ -5518,8 +5945,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
- /* for polled retry, if flag is set, we already went through here */
- if (req->flags & REQ_F_POLLED)
+ if (req->flags & REQ_F_LINK_TIMEOUT)
return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
@@ -5531,7 +5957,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt;
@@ -5551,54 +5978,45 @@ again:
old_creds = override_creds(req->work.creds);
}
- ret = io_issue_sqe(req, sqe, true);
+ ret = io_issue_sqe(req, sqe, true, cs);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
- (req->flags & REQ_F_MUST_PUNT))) {
- if (io_arm_poll_handler(req)) {
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- goto exit;
- }
+ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ if (!io_arm_poll_handler(req)) {
punt:
- io_req_init_async(req);
-
- if (io_op_defs[req->opcode].file_table) {
- ret = io_grab_files(req);
- if (ret)
+ ret = io_prep_work_files(req);
+ if (unlikely(ret))
goto err;
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
}
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
goto exit;
}
+ if (unlikely(ret)) {
err:
- nxt = NULL;
- /* drop submission reference */
- io_put_req_find_next(req, &nxt);
-
- if (linked_timeout) {
- if (!ret)
- io_queue_linked_timeout(linked_timeout);
- else
- io_put_req(linked_timeout);
- }
-
- /* and drop final reference, if we failed */
- if (ret) {
- io_cqring_add_event(req, ret);
+ /* un-prep timeout, so it'll be killed as any other linked */
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
+ io_req_complete(req, ret);
+ goto exit;
}
+
+ /* drop submission reference */
+ nxt = io_put_req_find_next(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+
if (nxt) {
req = nxt;
@@ -5611,7 +6029,8 @@ exit:
revert_creds(old_creds);
}
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs)
{
int ret;
@@ -5619,17 +6038,14 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
- io_cqring_add_event(req, ret);
req_set_fail_links(req);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
if (!req->io) {
- ret = -EAGAIN;
- if (io_alloc_async_ctx(req))
- goto fail_req;
ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret < 0))
+ if (unlikely(ret))
goto fail_req;
}
@@ -5637,24 +6053,26 @@ fail_req:
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
- __io_queue_sqe(req, sqe);
+ __io_queue_sqe(req, sqe, cs);
}
}
-static inline void io_queue_link_head(struct io_kiocb *req)
+static inline void io_queue_link_head(struct io_kiocb *req,
+ struct io_comp_state *cs)
{
if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
- io_cqring_add_event(req, -ECANCELED);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, -ECANCELED);
} else
- io_queue_sqe(req, NULL);
+ io_queue_sqe(req, NULL, cs);
}
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link)
+ struct io_kiocb **link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -5680,21 +6098,19 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
-
ret = io_req_defer_prep(req, sqe);
- if (ret) {
+ if (unlikely(ret)) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
return ret;
}
trace_io_uring_link(ctx, req, head);
+ io_get_req_task(req);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_link_head(head);
+ io_queue_link_head(head, cs);
*link = NULL;
}
} else {
@@ -5706,15 +6122,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
-
ret = io_req_defer_prep(req, sqe);
- if (ret)
+ if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
*link = req;
} else {
- io_queue_sqe(req, sqe);
+ io_queue_sqe(req, sqe, cs);
}
}
@@ -5726,6 +6139,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
*/
static void io_submit_state_end(struct io_submit_state *state)
{
+ if (!list_empty(&state->comp.list))
+ io_submit_flush_completions(&state->comp);
blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
@@ -5736,9 +6151,15 @@ static void io_submit_state_end(struct io_submit_state *state)
* Start submission side cache.
*/
static void io_submit_state_start(struct io_submit_state *state,
- unsigned int max_ios)
+ struct io_ring_ctx *ctx, unsigned int max_ios)
{
blk_start_plug(&state->plug);
+#ifdef CONFIG_BLOCK
+ state->plug.nowait = true;
+#endif
+ state->comp.nr = 0;
+ INIT_LIST_HEAD(&state->comp.list);
+ state->comp.ctx = ctx;
state->free_reqs = 0;
state->file = NULL;
state->ios_left = max_ios;
@@ -5803,12 +6224,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
unsigned int sqe_flags;
int id;
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
req->io = NULL;
@@ -5817,17 +6232,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
- req->task = NULL;
+ req->task = current;
req->result = 0;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
- }
+ if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
@@ -5859,7 +6271,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct file *ring_file, int ring_fd)
{
- struct io_submit_state state, *statep = NULL;
+ struct io_submit_state state;
struct io_kiocb *link = NULL;
int i, submitted = 0;
@@ -5876,10 +6288,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- if (nr > IO_PLUG_THRESHOLD) {
- io_submit_state_start(&state, nr);
- statep = &state;
- }
+ io_submit_state_start(&state, ctx, nr);
ctx->ring_fd = ring_fd;
ctx->ring_file = ring_file;
@@ -5894,28 +6303,28 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
io_consume_sqe(ctx);
break;
}
- req = io_alloc_req(ctx, statep);
+ req = io_alloc_req(ctx, &state);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- err = io_init_req(ctx, req, sqe, statep);
+ err = io_init_req(ctx, req, sqe, &state);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
if (unlikely(err)) {
fail_req:
- io_cqring_add_event(req, err);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, err);
break;
}
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, io_async_submit(ctx));
- err = io_submit_sqe(req, sqe, &link);
+ err = io_submit_sqe(req, sqe, &link, &state.comp);
if (err)
goto fail_req;
}
@@ -5926,9 +6335,8 @@ fail_req:
percpu_ref_put_many(&ctx->refs, nr - ref_used);
}
if (link)
- io_queue_link_head(link);
- if (statep)
- io_submit_state_end(&state);
+ io_queue_link_head(link, &state.comp);
+ io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -5936,14 +6344,19 @@ fail_req:
return submitted;
}
-static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
- struct mm_struct *mm = current->mm;
+ /* Tell userspace we may need a wakeup call */
+ spin_lock_irq(&ctx->completion_lock);
+ ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
+}
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- }
+static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
}
static int io_sq_thread(void *data)
@@ -5962,12 +6375,12 @@ static int io_sq_thread(void *data)
while (!kthread_should_park()) {
unsigned int to_submit;
- if (!list_empty(&ctx->poll_list)) {
+ if (!list_empty(&ctx->iopoll_list)) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->poll_list))
- io_iopoll_getevents(ctx, &nr_events, 0);
+ if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ io_do_iopoll(ctx, &nr_events, 0);
else
timeout = jiffies + ctx->sq_thread_idle;
mutex_unlock(&ctx->uring_lock);
@@ -5979,14 +6392,14 @@ static int io_sq_thread(void *data)
* If submit got -EBUSY, flag us as needing the application
* to enter the kernel to reap and flush events.
*/
- if (!to_submit || ret == -EBUSY) {
+ if (!to_submit || ret == -EBUSY || need_resched()) {
/*
* Drop cur_mm before scheduling, we can't hold it for
* long periods (or over schedule()). Do this before
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- io_sq_thread_drop_mm(ctx);
+ io_sq_thread_drop_mm();
/*
* We're polling. If we're within the defined idle
@@ -5995,11 +6408,10 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (!list_empty(&ctx->poll_list) ||
+ if (!list_empty(&ctx->iopoll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
cond_resched();
continue;
}
@@ -6009,21 +6421,18 @@ static int io_sq_thread(void *data)
/*
* While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to poll_list, it
- * is because reqs may have been punted to io worker and
- * will be added to poll_list later, hence check the
- * poll_list again.
+ * to check if there are new reqs added to iopoll_list,
+ * it is because reqs may have been punted to io worker
+ * and will be added to iopoll_list later, hence check
+ * the iopoll_list again.
*/
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->poll_list)) {
+ !list_empty_careful(&ctx->iopoll_list)) {
finish_wait(&ctx->sqo_wait, &wait);
continue;
}
- /* Tell userspace we may need a wakeup call */
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
- /* make sure to read SQ tail after writing flags */
- smp_mb();
+ io_ring_set_wakeup_flag(ctx);
to_submit = io_sqring_entries(ctx);
if (!to_submit || ret == -EBUSY) {
@@ -6031,9 +6440,9 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
- if (current->task_works) {
- task_work_run();
+ if (io_run_task_work()) {
finish_wait(&ctx->sqo_wait, &wait);
+ io_ring_clear_wakeup_flag(ctx);
continue;
}
if (signal_pending(current))
@@ -6041,13 +6450,13 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ io_ring_clear_wakeup_flag(ctx);
ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ io_ring_clear_wakeup_flag(ctx);
}
mutex_lock(&ctx->uring_lock);
@@ -6057,10 +6466,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
- io_sq_thread_drop_mm(ctx);
+ io_sq_thread_drop_mm();
revert_creds(old_cred);
kthread_parkme();
@@ -6123,9 +6531,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
if (io_cqring_events(ctx, false) >= min_events)
return 0;
- if (!current->task_works)
+ if (!io_run_task_work())
break;
- task_work_run();
} while (1);
if (sig) {
@@ -6146,15 +6553,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- if (current->task_works)
- task_work_run();
- if (io_should_wake(&iowq, false))
- break;
- schedule();
+ /* make sure we run task_work before checking for signals */
+ if (io_run_task_work())
+ continue;
if (signal_pending(current)) {
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ continue;
+ }
ret = -EINTR;
break;
}
+ if (io_should_wake(&iowq, false))
+ break;
+ schedule();
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6626,6 +7041,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_data->table[i].files);
+ percpu_ref_exit(&ctx->file_data->refs);
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
@@ -6778,8 +7194,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
- if (err)
+ if (err) {
+ fput(file);
break;
+ }
}
nr_args--;
done++;
@@ -6923,17 +7341,21 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
return 0;
err:
io_finish_async(ctx);
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
+ if (ctx->sqo_mm) {
+ mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ }
return ret;
}
-static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+static inline void __io_unaccount_mem(struct user_struct *user,
+ unsigned long nr_pages)
{
atomic_long_sub(nr_pages, &user->locked_vm);
}
-static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+static inline int __io_account_mem(struct user_struct *user,
+ unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -6951,6 +7373,41 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
return 0;
}
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+ enum io_mem_account acct)
+{
+ if (ctx->limit_mem)
+ __io_unaccount_mem(ctx->user, nr_pages);
+
+ if (ctx->sqo_mm) {
+ if (acct == ACCT_LOCKED)
+ ctx->sqo_mm->locked_vm -= nr_pages;
+ else if (acct == ACCT_PINNED)
+ atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+ }
+}
+
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+ enum io_mem_account acct)
+{
+ int ret;
+
+ if (ctx->limit_mem) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ return ret;
+ }
+
+ if (ctx->sqo_mm) {
+ if (acct == ACCT_LOCKED)
+ ctx->sqo_mm->locked_vm += nr_pages;
+ else if (acct == ACCT_PINNED)
+ atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+ }
+
+ return 0;
+}
+
static void io_mem_free(void *ptr)
{
struct page *page;
@@ -6987,6 +7444,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return SIZE_MAX;
#endif
+ if (sq_offset)
+ *sq_offset = off;
+
sq_array_size = array_size(sizeof(u32), sq_entries);
if (sq_array_size == SIZE_MAX)
return SIZE_MAX;
@@ -6994,9 +7454,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
if (check_add_overflow(off, sq_array_size, &off))
return SIZE_MAX;
- if (sq_offset)
- *sq_offset = off;
-
return off;
}
@@ -7025,8 +7482,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
for (j = 0; j < imu->nr_bvecs; j++)
unpin_user_page(imu->bvec[j].bv_page);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, imu->nr_bvecs);
+ io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -7109,11 +7565,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- if (ctx->account_mem) {
- ret = io_account_mem(ctx->user, nr_pages);
- if (ret)
- goto err;
- }
+ ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
+ if (ret)
+ goto err;
ret = 0;
if (!pages || nr_pages > got_pages) {
@@ -7126,8 +7580,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
if (!pages || !vmas) {
ret = -ENOMEM;
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
got_pages = nr_pages;
@@ -7137,8 +7590,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
ret = -ENOMEM;
if (!imu->bvec) {
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
@@ -7169,8 +7621,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (pret > 0)
unpin_user_pages(pages, pret);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
kvfree(imu->bvec);
goto err;
}
@@ -7254,11 +7705,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
- if (ctx->sqo_mm)
+ io_sqe_buffer_unregister(ctx);
+ if (ctx->sqo_mm) {
mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ }
- io_iopoll_reap_events(ctx);
- io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
@@ -7275,9 +7727,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_mem_free(ctx->sq_sqes);
percpu_ref_exit(&ctx->refs);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user,
- ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
kfree(ctx->cancel_hash);
@@ -7325,13 +7774,20 @@ static int io_remove_personalities(int id, void *p, void *data)
static void io_ring_exit_work(struct work_struct *work)
{
- struct io_ring_ctx *ctx;
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+ exit_work);
- ctx = container_of(work, struct io_ring_ctx, exit_work);
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
-
- wait_for_completion(&ctx->ref_comp);
+ /*
+ * If we're doing polled IO and end up having requests being
+ * submitted async (out-of-line), then completions can come in while
+ * we're waiting for refs to drop. We need to reap these manually,
+ * as nobody else will be looking for them.
+ */
+ do {
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ io_iopoll_try_reap_events(ctx);
+ } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
@@ -7347,11 +7803,20 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
- io_iopoll_reap_events(ctx);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
+ io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+
+ /*
+ * Do this upfront, so we won't have a grace period where the ring
+ * is closed but resources aren't reaped yet. This can cause
+ * spurious failure in setting up a new ring.
+ */
+ io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
+ ACCT_LOCKED);
+
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
queue_work(system_wq, &ctx->exit_work);
}
@@ -7365,9 +7830,22 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+ struct files_struct *files = data;
+
+ return work->files == files;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
+ if (list_empty_careful(&ctx->inflight_list))
+ return;
+
+ /* cancel all at once, should be faster than doing it one by one*/
+ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
+
while (!list_empty_careful(&ctx->inflight_list)) {
struct io_kiocb *cancel_req = NULL, *req;
DEFINE_WAIT(wait);
@@ -7393,16 +7871,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (cancel_req->flags & REQ_F_OVERFLOW) {
spin_lock_irq(&ctx->completion_lock);
- list_del(&cancel_req->list);
+ list_del(&cancel_req->compl.list);
cancel_req->flags &= ~REQ_F_OVERFLOW;
- if (list_empty(&ctx->cq_overflow_list)) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- }
- spin_unlock_irq(&ctx->completion_lock);
+ io_cqring_mark_overflow(ctx);
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
/*
* Put inflight ref and overflow ref. If that's
@@ -7423,6 +7899,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
}
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+ return req->task == task;
+}
+
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
@@ -7433,7 +7917,7 @@ static int io_uring_flush(struct file *file, void *data)
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
return 0;
}
@@ -7517,8 +8001,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
@@ -7557,8 +8040,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
- unsigned nr_events = 0;
-
min_complete = min(min_complete, ctx->cq_entries);
/*
@@ -7569,7 +8050,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_iopoll_check(ctx, &nr_events, min_complete);
+ ret = io_iopoll_check(ctx, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
}
@@ -7774,7 +8255,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
{
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
- bool account_mem;
+ bool limit_mem;
int ret;
if (!entries)
@@ -7813,10 +8294,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
user = get_uid(current_user());
- account_mem = !capable(CAP_IPC_LOCK);
+ limit_mem = !capable(CAP_IPC_LOCK);
- if (account_mem) {
- ret = io_account_mem(user,
+ if (limit_mem) {
+ ret = __io_account_mem(user,
ring_pages(p->sq_entries, p->cq_entries));
if (ret) {
free_uid(user);
@@ -7826,14 +8307,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx = io_ring_ctx_alloc(p);
if (!ctx) {
- if (account_mem)
- io_unaccount_mem(user, ring_pages(p->sq_entries,
+ if (limit_mem)
+ __io_unaccount_mem(user, ring_pages(p->sq_entries,
p->cq_entries));
free_uid(user);
return -ENOMEM;
}
ctx->compat = in_compat_syscall();
- ctx->account_mem = account_mem;
ctx->user = user;
ctx->creds = get_current_cred();
@@ -7865,12 +8345,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
+ IORING_FEAT_POLL_32BITS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
goto err;
}
+
+ /*
+ * Account memory _before_ installing the file descriptor. Once
+ * the descriptor is installed, it can get closed at any time.
+ */
+ io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+ ACCT_LOCKED);
+ ctx->limit_mem = limit_mem;
+
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
@@ -8154,7 +8644,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
+ BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
+ BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d634561f871a..78f5c96c76f3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -612,9 +612,6 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block)
/*
* Initialize the superblock and read the root inode.
- *
- * Note: a check_disk_change() has been done immediately prior
- * to this call, so we don't need to check again.
*/
static int isofs_fill_super(struct super_block *s, void *data, int silent)
{
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index cac468f04820..402769881c32 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -152,8 +152,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
int found;
- unsigned long uninitialized_var(block);
- unsigned long uninitialized_var(offset);
+ unsigned long block;
+ unsigned long offset;
struct inode *inode;
struct page *page;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a49d0e670ddf..e4944436e733 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1140,6 +1140,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
+ mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1402,7 +1403,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
printk(KERN_ERR "JBD2: Error %d detected when updating "
"journal superblock for %s.\n", ret,
journal->j_devname);
- jbd2_journal_abort(journal, ret);
+ if (!is_journal_aborted(journal))
+ jbd2_journal_abort(journal, ret);
}
return ret;
@@ -2154,6 +2156,13 @@ void jbd2_journal_abort(journal_t *journal, int errno)
transaction_t *transaction;
/*
+ * Lock the aborting procedure until everything is done, this avoid
+ * races between filesystem's error handling flow (e.g. ext4_abort()),
+ * ensure panic after the error info is written into journal's
+ * superblock.
+ */
+ mutex_lock(&journal->j_abort_mutex);
+ /*
* ESHUTDOWN always takes precedence because a file system check
* caused by any other journal abort error is not required after
* a shutdown triggered.
@@ -2167,6 +2176,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
journal->j_errno = errno;
jbd2_journal_update_sb_errno(journal);
}
+ mutex_unlock(&journal->j_abort_mutex);
return;
}
@@ -2188,10 +2198,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
* layer could realise that a filesystem check is needed.
*/
jbd2_journal_update_sb_errno(journal);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
+ mutex_unlock(&journal->j_abort_mutex);
}
/**
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 83b8f06b4a64..7e9abdb89712 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -401,7 +401,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
{
size_t retlen;
int ret;
- uint32_t uninitialized_var(bad_offset);
+ uint32_t bad_offset;
switch (jffs2_block_check_erase(c, jeb, &bad_offset)) {
case -EAGAIN: goto refile;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 0637271f3770..8ff4d1a1e774 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -259,7 +259,7 @@ struct jffs2_full_dirent
uint32_t ino; /* == zero for unlink */
unsigned int nhash;
unsigned char type;
- unsigned char name[0];
+ unsigned char name[];
};
/*
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 60207a2ae952..e4131cb1f1d4 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -61,7 +61,7 @@ struct jffs2_sum_dirent_flash
jint32_t ino; /* == zero for unlink */
uint8_t nsize; /* dirent name size */
uint8_t type; /* dirent type */
- uint8_t name[0]; /* dirent name */
+ uint8_t name[]; /* dirent name */
} __attribute__((packed));
struct jffs2_sum_xattr_flash
@@ -117,7 +117,7 @@ struct jffs2_sum_dirent_mem
jint32_t ino; /* == zero for unlink */
uint8_t nsize; /* dirent name size */
uint8_t type; /* dirent type */
- uint8_t name[0]; /* dirent name */
+ uint8_t name[]; /* dirent name */
} __attribute__((packed));
struct jffs2_sum_xattr_mem
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index eb8b9e233d73..2935d4c776ec 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -36,6 +36,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 66acea9d878b..bde787c354fc 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -6,6 +6,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
diff --git a/fs/locks.c b/fs/locks.c
index 7df0f9fa66f4..938fe325bc54 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1282,6 +1282,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
if (!new_fl)
goto out;
locks_copy_lock(new_fl, request);
+ locks_move_blocks(new_fl, request);
request = new_fl;
new_fl = NULL;
locks_insert_lock_ctx(request, &fl->fl_list);
diff --git a/fs/namespace.c b/fs/namespace.c
index f30ed401cc6d..4a0f600a3328 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2603,6 +2603,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ fc->oldapi = true;
err = parse_monolithic_mount_data(fc, data);
if (!err) {
down_write(&sb->s_umount);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d399f72ebbb..de03e440b7ee 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -907,9 +907,8 @@ retry:
goto out_mds;
/* Use a direct mapping of ds_idx to pgio mirror_idx */
- if (WARN_ON_ONCE(pgio->pg_mirror_count !=
- FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
- goto out_mds;
+ if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+ goto out_eagain;
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -931,7 +930,10 @@ retry:
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
return;
-
+out_eagain:
+ pnfs_generic_pg_cleanup(pgio);
+ pgio->pg_error = -EAGAIN;
+ return;
out_mds:
trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
0, NFS4_MAX_UINT64, IOMODE_RW,
@@ -941,6 +943,7 @@ out_mds:
pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_write_mds(pgio);
+ pgio->pg_error = -EAGAIN;
}
static unsigned int
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a3ab6e219061..873342308dc0 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -308,6 +308,7 @@ static int try_location(struct fs_context *fc,
if (IS_ERR(export_path))
return PTR_ERR(export_path);
+ kfree(ctx->nfs_server.export_path);
ctx->nfs_server.export_path = export_path;
source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e32717fd1169..8963062da57e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -414,7 +414,7 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
- freezable_schedule_timeout_interruptible(nfs4_update_delay(timeout));
+ freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
if (!signal_pending(current))
return 0;
return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
@@ -774,6 +774,14 @@ static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
slot->seq_nr_last_acked = seqnr;
}
+static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred,
+ struct nfs4_slot *slot)
+{
+ struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
@@ -790,6 +798,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto out;
session = slot->table->session;
+ clp = session->clp;
trace_nfs4_sequence_done(session, res);
@@ -804,7 +813,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
- clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
@@ -852,10 +860,18 @@ static int nfs41_sequence_process(struct rpc_task *task,
/*
* Were one or more calls using this slot interrupted?
* If the server never received the request, then our
- * transmitted slot sequence number may be too high.
+ * transmitted slot sequence number may be too high. However,
+ * if the server did receive the request then it might
+ * accidentally give us a reply with a mismatched operation.
+ * We can sort this out by sending a lone sequence operation
+ * to the server on the same slot.
*/
if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
slot->seq_nr--;
+ if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) {
+ nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot);
+ res->sr_slot = NULL;
+ }
goto retry_nowait;
}
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bb3d2c32664a..c9056316a0b3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -507,6 +507,17 @@ find_any_file(struct nfs4_file *f)
return ret;
}
+static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+{
+ struct nfsd_file *ret = NULL;
+
+ spin_lock(&f->fi_lock);
+ if (f->fi_deleg_file)
+ ret = nfsd_file_get(f->fi_deleg_file);
+ spin_unlock(&f->fi_lock);
+ return ret;
+}
+
static atomic_long_t num_delegations;
unsigned long max_delegations;
@@ -2444,6 +2455,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2481,6 +2494,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2513,7 +2528,9 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = nf->fi_deleg_file;
+ file = find_deleg_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2529,6 +2546,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_fname(s, file);
seq_printf(s, " }\n");
+ nfsd_file_put(file);
return 0;
}
@@ -7912,9 +7930,14 @@ nfs4_state_start_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- ret = nfs4_state_create_net(net);
+ ret = get_nfsdfs(net);
if (ret)
return ret;
+ ret = nfs4_state_create_net(net);
+ if (ret) {
+ mntput(nn->nfsd_mnt);
+ return ret;
+ }
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
@@ -7984,6 +8007,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ mntput(nn->nfsd_mnt);
}
void
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b68e96681522..7ae236113040 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -351,7 +351,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
{
char *dname, *path;
- int uninitialized_var(maxsize);
+ int maxsize;
char *mesg = buf;
int len;
struct auth_domain *dom;
@@ -1335,6 +1335,7 @@ void nfsd_client_rmdir(struct dentry *dentry)
WARN_ON_ONCE(ret);
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
+ dput(dentry);
inode_unlock(dir);
}
@@ -1424,6 +1425,18 @@ static struct file_system_type nfsd_fs_type = {
};
MODULE_ALIAS_FS("nfsd");
+int get_nfsdfs(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct vfsmount *mnt;
+
+ mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ nn->nfsd_mnt = mnt;
+ return 0;
+}
+
#ifdef CONFIG_PROC_FS
static int create_proc_exports_entry(void)
{
@@ -1451,7 +1464,6 @@ unsigned int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
int retval;
- struct vfsmount *mnt;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
retval = nfsd_export_init(net);
@@ -1478,16 +1490,8 @@ static __net_init int nfsd_init_net(struct net *net)
init_waitqueue_head(&nn->ntf_wq);
seqlock_init(&nn->boot_lock);
- mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
- if (IS_ERR(mnt)) {
- retval = PTR_ERR(mnt);
- goto out_mount_err;
- }
- nn->nfsd_mnt = mnt;
return 0;
-out_mount_err:
- nfsd_reply_cache_shutdown(nn);
out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
@@ -1500,7 +1504,6 @@ static __net_exit void nfsd_exit_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- mntput(nn->nfsd_mnt);
nfsd_reply_cache_shutdown(nn);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 36cdd81b6688..57c832d1b30f 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -90,6 +90,8 @@ void nfsd_destroy(struct net *net);
bool i_am_nfsd(void);
+int get_nfsdfs(struct net *);
+
struct nfsdfs_client {
struct kref cl_ref;
void (*cl_release)(struct kref *kref);
@@ -100,6 +102,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
void nfsd_client_rmdir(struct dentry *dentry);
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
#ifdef CONFIG_NFSD_V2_ACL
extern const struct svc_version nfsd_acl_version2;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c3fbab1753ec..d22a056da477 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1226,6 +1226,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
err = 0;
host_err = 0;
switch (type) {
@@ -1458,6 +1461,9 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
if (host_err < 0) {
fh_drop_write(fhp);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 3c4811469ae8..a87d4391e6b5 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -8,6 +8,7 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include "dir.h"
#include "aops.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2f834add165b..4c1b90442d6f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4707,7 +4707,7 @@ int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_alloc_context *meta_ac)
{
int status;
- int uninitialized_var(free_records);
+ int free_records;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
@@ -7051,7 +7051,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
- u64 uninitialized_var(block);
+ u64 block;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5761060d2ba8..bdfba9db558a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -848,9 +848,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode,
u64 *ret_phys_blkno)
{
int ret = 0;
- unsigned int cend, uninitialized_var(clen);
- u32 uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int cend, clen;
+ u32 cpos;
+ u64 blkno;
u32 name_hash = hinfo->major_hash;
ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
@@ -894,7 +894,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
struct ocfs2_dir_lookup_result *res)
{
int ret, i, found;
- u64 uninitialized_var(phys);
+ u64 phys;
struct buffer_head *dx_leaf_bh = NULL;
struct ocfs2_dx_leaf *dx_leaf;
struct ocfs2_dx_entry *dx_entry = NULL;
@@ -4393,9 +4393,9 @@ out:
int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
{
int ret;
- unsigned int uninitialized_var(clen);
- u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int clen;
+ u32 major_hash = UINT_MAX, p_cpos, cpos;
+ u64 blkno;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 152a0fc4e905..751bc4dc7466 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
@@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
@@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
}
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
@@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index a94852af5510..7b93e9c766f6 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -403,7 +403,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
{
int i, ret, tree_height, len;
struct ocfs2_dinode *di;
- struct ocfs2_extent_block *uninitialized_var(eb);
+ struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct buffer_head *eb_bh = NULL;
@@ -599,7 +599,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
unsigned int *extent_flags)
{
int ret;
- unsigned int uninitialized_var(hole_len), flags = 0;
+ unsigned int hole_len, flags = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5381020aaa9a..c46bf7f581a1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2498,7 +2498,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- u64 uninitialized_var(di_blkno), suballoc_loc;
+ u64 di_blkno, suballoc_loc;
u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee5d98516212..2dd71d626196 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -395,6 +395,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct rw_semaphore nfs_sync_rwlock;
struct ocfs2_lock_res osb_trim_fs_lockres;
struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0dd8c41bafd4..19137c6d087b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -290,7 +290,7 @@
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT -1
+#define OCFS2_INVALID_SLOT ((u16)-1)
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
USER_QUOTA_SYSTEM_INODE,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index cfb77f70c888..3b397fa9c9e8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1063,7 +1063,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
- u32 low_cpos, uninitialized_var(cpos_end);
+ u32 low_cpos, cpos_end;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec = NULL;
struct ocfs2_extent_block *eb = NULL;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4836becb7578..45745cc3408a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- inode_alloc_inode =
- ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
- suballoc_slot);
+ if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+ else
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
if (!inode_alloc_inode) {
/* the error code could be inaccurate, but we are not able to
* get the correct one. */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..9ccd19d8f7b1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1211,7 +1211,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
struct ocfs2_xattr_value_root *xv;
size_t size;
int ret = -ENODATA, name_offset, name_len, i;
- int uninitialized_var(block_off);
+ int block_off;
xs->bucket = ocfs2_xattr_bucket_new(inode);
if (!xs->bucket) {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d7b5f09d298c..2c7b70ee1388 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -220,7 +220,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
struct buffer_head *bh;
sector_t next, offset;
int ret;
- u64 uninitialized_var(new_block);
+ u64 new_block;
u32 max_extents;
int extent_count;
struct omfs_extent *oe;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 79dd052c7dbf..d07fb92b7253 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -787,7 +787,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
struct path upperpath, datapath;
int err;
char *capability = NULL;
- ssize_t uninitialized_var(cap_size);
+ ssize_t cap_size;
ovl_path_upper(c->dentry, &upperpath);
if (WARN_ON(upperpath.dentry == NULL))
@@ -895,7 +895,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return err;
}
-int ovl_copy_up_flags(struct dentry *dentry, int flags)
+static int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 8f4286450f92..0e696f72cf65 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -476,7 +476,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
if (IS_ERR_OR_NULL(this))
return this;
- if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
+ if (ovl_dentry_real_at(this, layer->idx) != real) {
dput(this);
this = ERR_PTR(-EIO);
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 01820e654a21..0d940e29d62b 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -33,13 +33,16 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
+/* No atime modificaton nor notify on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+
static struct file *ovl_open_realfile(const struct file *file,
struct inode *realinode)
{
struct inode *inode = file_inode(file);
struct file *realfile;
const struct cred *old_cred;
- int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
int acc_mode = ACC_MODE(flags);
int err;
@@ -72,8 +75,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
struct inode *inode = file_inode(file);
int err;
- /* No atime modificaton on underlying */
- flags |= O_NOATIME | FMODE_NONOTIFY;
+ flags |= OVL_OPEN_FLAGS;
/* If some flag changed that cannot be changed then something's amiss */
if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
@@ -126,7 +128,7 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
}
/* Did the flags change since open? */
- if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME))
+ if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
return ovl_change_flags(real->file, file->f_flags);
return 0;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 3566282a9199..f7d4358db637 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -389,7 +389,7 @@ invalid:
}
static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
- struct ovl_path **stackp, unsigned int *ctrp)
+ struct ovl_path **stackp)
{
struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN);
int err;
@@ -406,10 +406,6 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
return err;
}
- if (WARN_ON(*ctrp))
- return -EIO;
-
- *ctrp = 1;
return 0;
}
@@ -861,8 +857,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
if (upperdentry && !d.is_dir) {
- unsigned int origin_ctr = 0;
-
/*
* Lookup copy up origin by decoding origin file handle.
* We may get a disconnected dentry, which is fine,
@@ -873,8 +867,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* number - it's the same as if we held a reference
* to a dentry in lower layer that was moved under us.
*/
- err = ovl_check_origin(ofs, upperdentry, &origin_path,
- &origin_ctr);
+ err = ovl_check_origin(ofs, upperdentry, &origin_path);
if (err)
goto out_put_upper;
@@ -1073,6 +1066,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperredirect = NULL;
goto out_free_oe;
}
+ err = ovl_check_metacopy_xattr(upperdentry);
+ if (err < 0)
+ goto out_free_oe;
+ uppermetacopy = err;
}
if (upperdentry || ctr) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b725c7f15ff4..29bc1ec699e7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -483,7 +483,6 @@ void ovl_aio_request_cache_destroy(void);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
-int ovl_copy_up_flags(struct dentry *dentry, int flags);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 91476bc422f9..4b38141c2985 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -580,12 +580,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
}
}
- /* Workdir is useless in non-upper mount */
- if (!config->upperdir && config->workdir) {
- pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
- config->workdir);
- kfree(config->workdir);
- config->workdir = NULL;
+ /* Workdir/index are useless in non-upper mount */
+ if (!config->upperdir) {
+ if (config->workdir) {
+ pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+ if (config->index && index_opt) {
+ pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n");
+ index_opt = false;
+ }
+ config->index = false;
}
err = ovl_parse_redirect_mode(config, config->redirect_mode);
@@ -622,11 +629,13 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
/* Resolve nfs_export -> index dependency */
if (config->nfs_export && !config->index) {
- if (nfs_export_opt && index_opt) {
+ if (!config->upperdir && config->redirect_follow) {
+ pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
+ config->nfs_export = false;
+ } else if (nfs_export_opt && index_opt) {
pr_err("conflicting options: nfs_export=on,index=off\n");
return -EINVAL;
- }
- if (index_opt) {
+ } else if (index_opt) {
/*
* There was an explicit index=off that resulted
* in this conflict.
@@ -1352,8 +1361,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
goto out;
}
+ /* index dir will act also as workdir */
+ iput(ofs->workdir_trap);
+ ofs->workdir_trap = NULL;
+ dput(ofs->workdir);
+ ofs->workdir = NULL;
ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
if (ofs->indexdir) {
+ ofs->workdir = dget(ofs->indexdir);
+
err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
"indexdir");
if (err)
@@ -1396,6 +1412,18 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
return true;
+ /*
+ * We allow using single lower with null uuid for index and nfs_export
+ * for example to support those features with single lower squashfs.
+ * To avoid regressions in setups of overlay with re-formatted lower
+ * squashfs, do not allow decoding origin with lower null uuid unless
+ * user opted-in to one of the new features that require following the
+ * lower inode of non-dir upper.
+ */
+ if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino &&
+ uuid_is_null(uuid))
+ return false;
+
for (i = 0; i < ofs->numfs; i++) {
/*
* We use uuid to associate an overlay lower file handle with a
@@ -1493,14 +1521,23 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
+ /*
+ * Check if lower root conflicts with this overlay layers before
+ * checking if it is in-use as upperdir/workdir of "another"
+ * mount, because we do not bother to check in ovl_is_inuse() if
+ * the upperdir/workdir is in fact in-use by our
+ * upperdir/workdir.
+ */
err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
if (err)
goto out;
if (ovl_is_inuse(stack[i].dentry)) {
err = ovl_report_in_use(ofs, "lowerdir");
- if (err)
+ if (err) {
+ iput(trap);
goto out;
+ }
}
mnt = clone_private_mount(&stack[i]);
@@ -1575,10 +1612,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
if (!ofs->config.upperdir && numlower == 1) {
pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
return ERR_PTR(-EINVAL);
- } else if (!ofs->config.upperdir && ofs->config.nfs_export &&
- ofs->config.redirect_follow) {
- pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
- ofs->config.nfs_export = false;
}
stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL);
@@ -1842,21 +1875,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
- if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
- /* index dir will act also as workdir */
- dput(ofs->workdir);
- ofs->workdir = NULL;
- iput(ofs->workdir_trap);
- ofs->workdir_trap = NULL;
-
+ if (!ovl_force_readonly(ofs) && ofs->config.index) {
err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
if (err)
goto out_free_oe;
/* Force r/o mount with no index dir */
- if (ofs->indexdir)
- ofs->workdir = dget(ofs->indexdir);
- else
+ if (!ofs->indexdir)
sb->s_flags |= SB_RDONLY;
}
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 9955d75c0585..ad31ec4ad627 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v)
static int __init copy_xbc_key_value_list(char *dst, size_t size)
{
struct xbc_node *leaf, *vnode;
- const char *val;
char *key, *end = dst + size;
+ const char *val;
+ char q;
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
@@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
vnode = xbc_node_get_child(leaf);
- if (vnode && xbc_node_is_array(vnode)) {
+ if (vnode) {
xbc_array_for_each_value(vnode, val) {
- ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
- val, vnode->next ? ", " : "\n");
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, vnode->next ? ", " : "\n");
if (ret < 0)
goto out;
dst += ret;
}
} else {
- ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
if (ret < 0)
break;
dst += ret;
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 37d38697eaf8..837971e74109 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
static int devinfo_show(struct seq_file *f, void *v)
{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8ba492d44e68..e502414b3556 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* Using bounce buffer to bypass the
* hardened user copy kernel text checks.
*/
- if (probe_kernel_read(buf, (void *) start, tsz)) {
+ if (copy_from_kernel_nofault(buf, (void *)start,
+ tsz)) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c..6c1166ccdaea 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -566,8 +566,9 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
goto out;
/* don't even try if the size is too large */
- if (count > KMALLOC_MAX_SIZE)
- return -ENOMEM;
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
if (write) {
kbuf = memdup_user_nul(ubuf, count);
@@ -576,7 +577,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
goto out;
}
} else {
- error = -ENOMEM;
kbuf = kzalloc(count, GFP_KERNEL);
if (!kbuf)
goto out;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a9e297eefdff..36714df37d5d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -269,6 +269,9 @@ static int pstore_compress(const void *in, void *out,
{
int ret;
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION))
+ return -EINVAL;
+
ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
if (ret) {
pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
@@ -668,7 +671,7 @@ static void decompress_record(struct pstore_record *record)
int unzipped_len;
char *unzipped, *workspace;
- if (!record->compressed)
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed)
return;
/* Only PSTORE_TYPE_DMESG support compression. */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7b4bac91146b..bb02989d92b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -78,6 +78,7 @@
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..4fb797822567 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,28 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
return ret;
}
-ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
- loff_t *pos)
+ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
+ mm_segment_t old_fs = get_fs();
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ set_fs(KERNEL_DS);
if (file->f_op->read)
- return file->f_op->read(file, buf, count, pos);
+ ret = file->f_op->read(file, (void __user *)buf, count, pos);
else if (file->f_op->read_iter)
- return new_sync_read(file, buf, count, pos);
+ ret = new_sync_read(file, (void __user *)buf, count, pos);
else
- return -EINVAL;
+ ret = -EINVAL;
+ set_fs(old_fs);
+ if (ret > 0) {
+ fsnotify_access(file);
+ add_rchar(current, ret);
+ }
+ inc_syscr(current);
+ return ret;
}
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs;
- ssize_t result;
+ ssize_t ret;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The cast to a user pointer is valid due to the set_fs() */
- result = vfs_read(file, (void __user *)buf, count, pos);
- set_fs(old_fs);
- return result;
+ ret = rw_verify_area(READ, file, pos, count);
+ if (ret)
+ return ret;
+ return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);
@@ -456,17 +470,22 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
- if (!ret) {
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- ret = __vfs_read(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_access(file);
- add_rchar(current, ret);
- }
- inc_syscr(current);
- }
+ if (ret)
+ return ret;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ if (file->f_op->read)
+ ret = file->f_op->read(file, buf, count, pos);
+ else if (file->f_op->read_iter)
+ ret = new_sync_read(file, buf, count, pos);
+ else
+ ret = -EINVAL;
+ if (ret > 0) {
+ fsnotify_access(file);
+ add_rchar(current, ret);
+ }
+ inc_syscr(current);
return ret;
}
@@ -488,23 +507,15 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
return ret;
}
-static ssize_t __vfs_write(struct file *file, const char __user *p,
- size_t count, loff_t *pos)
-{
- if (file->f_op->write)
- return file->f_op->write(file, p, count, pos);
- else if (file->f_op->write_iter)
- return new_sync_write(file, p, count, pos);
- else
- return -EINVAL;
-}
-
+/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
mm_segment_t old_fs;
const char __user *p;
ssize_t ret;
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
+ return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
@@ -513,7 +524,12 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
- ret = __vfs_write(file, p, count, pos);
+ if (file->f_op->write)
+ ret = file->f_op->write(file, p, count, pos);
+ else if (file->f_op->write_iter)
+ ret = new_sync_write(file, p, count, pos);
+ else
+ ret = -EINVAL;
set_fs(old_fs);
if (ret > 0) {
fsnotify_modify(file);
@@ -522,21 +538,20 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
-EXPORT_SYMBOL(__kernel_write);
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
loff_t *pos)
{
- mm_segment_t old_fs;
- ssize_t res;
+ ssize_t ret;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_write(file, (__force const char __user *)buf, count, pos);
- set_fs(old_fs);
+ ret = rw_verify_area(WRITE, file, pos, count);
+ if (ret)
+ return ret;
- return res;
+ file_start_write(file);
+ ret = __kernel_write(file, buf, count, pos);
+ file_end_write(file);
+ return ret;
}
EXPORT_SYMBOL(kernel_write);
@@ -552,19 +567,23 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
- if (!ret) {
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- file_start_write(file);
- ret = __vfs_write(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_modify(file);
- add_wchar(current, ret);
- }
- inc_syscw(current);
- file_end_write(file);
+ if (ret)
+ return ret;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ file_start_write(file);
+ if (file->f_op->write)
+ ret = file->f_op->write(file, buf, count, pos);
+ else if (file->f_op->write_iter)
+ ret = new_sync_write(file, buf, count, pos);
+ else
+ ret = -EINVAL;
+ if (ret > 0) {
+ fsnotify_modify(file);
+ add_wchar(current, ret);
}
-
+ inc_syscw(current);
+ file_end_write(file);
return ret;
}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index ff336513c254..155b82870333 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -15,6 +15,7 @@
#include "reiserfs.h"
#include <linux/init.h>
#include <linux/proc_fs.h>
+#include <linux/blkdev.h>
/*
* LOCKING:
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 64f61330564a..76bb1c846845 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -175,7 +175,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
/* Extract the length of the metadata block */
data = page_address(bvec->bv_page) + bvec->bv_offset;
length = data[offset];
- if (offset <= bvec->bv_len - 1) {
+ if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
} else {
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 7187bd1a30ea..8d64edb80ebf 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -262,7 +262,7 @@ struct squashfs_dir_index {
__le32 index;
__le32 start_block;
__le32 size;
- unsigned char name[0];
+ unsigned char name[];
};
struct squashfs_base_inode {
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
__le32 inode_number;
__le32 nlink;
__le32 symlink_size;
- char symlink[0];
+ char symlink[];
};
struct squashfs_reg_inode {
@@ -341,7 +341,7 @@ struct squashfs_reg_inode {
__le32 fragment;
__le32 offset;
__le32 file_size;
- __le16 block_list[0];
+ __le16 block_list[];
};
struct squashfs_lreg_inode {
@@ -358,7 +358,7 @@ struct squashfs_lreg_inode {
__le32 fragment;
__le32 offset;
__le32 xattr;
- __le16 block_list[0];
+ __le16 block_list[];
};
struct squashfs_dir_inode {
@@ -389,7 +389,7 @@ struct squashfs_ldir_inode {
__le16 i_count;
__le16 offset;
__le32 xattr;
- struct squashfs_dir_index index[0];
+ struct squashfs_dir_index index[];
};
union squashfs_inode {
@@ -410,7 +410,7 @@ struct squashfs_dir_entry {
__le16 inode_number;
__le16 type;
__le16 size;
- char name[0];
+ char name[];
};
struct squashfs_dir_header {
@@ -428,12 +428,12 @@ struct squashfs_fragment_entry {
struct squashfs_xattr_entry {
__le16 type;
__le16 size;
- char data[0];
+ char data[];
};
struct squashfs_xattr_val {
__le32 vsize;
- char value[0];
+ char value[];
};
struct squashfs_xattr_id {
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ad292c5a43a9..b5cdac9b0368 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -552,11 +552,11 @@ out:
*/
int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
{
- int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+ int lnum, offs, len, err = 0, last_level, child_cnt;
int first = 1, iip;
struct ubifs_debug_info *d = c->dbg;
- union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
- unsigned long long uninitialized_var(last_sqnum);
+ union ubifs_key lower_key, upper_key, l_key, u_key;
+ unsigned long long last_sqnum;
struct ubifs_idx_node *idx;
struct list_head list;
struct idx_node *i;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef85ec167a84..9d042942d8b2 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1260,7 +1260,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec64 time;
- unsigned int uninitialized_var(saved_nlink);
+ unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
/*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 49fe062ce45e..b77d1637bbbc 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -222,7 +222,7 @@ static int write_begin_slow(struct address_space *mapping,
struct ubifs_info *c = inode->i_sb->s_fs_info;
pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
- int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ int err, appending = !!(pos + len > inode->i_size);
struct page *page;
dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
@@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
pgoff_t index = pos >> PAGE_SHIFT;
- int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ int err, appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
struct page *page;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index e5ec1afe1c66..2e6264318bd9 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1222,7 +1222,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
- struct ubifs_inode *uninitialized_var(new_ui);
+ struct ubifs_inode *new_ui;
u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
@@ -1507,7 +1507,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
union ubifs_key key, to_key;
struct ubifs_ino_node *ino;
struct ubifs_trun_node *trun;
- struct ubifs_data_node *uninitialized_var(dn);
+ struct ubifs_data_node *dn;
int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
ino_t inum = inode->i_ino;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index e21abf250951..6e0a153b7194 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -275,7 +275,7 @@ uint32_t ubifs_unpack_bits(const struct ubifs_info *c, uint8_t **addr, int *pos,
const int k = 32 - nrbits;
uint8_t *p = *addr;
int b = *pos;
- uint32_t uninitialized_var(val);
+ uint32_t val;
const int bytes = (nrbits + b + 7) >> 3;
ubifs_assert(c, nrbits > 0);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e8e7b0e9532e..f609f6cdde70 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -892,7 +892,7 @@ static int fallible_resolve_collision(struct ubifs_info *c,
int adding)
{
struct ubifs_znode *o_znode = NULL, *znode = *zn;
- int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+ int o_n, err, cmp, unsure = 0, nn = *n;
cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
if (unlikely(cmp < 0))
@@ -1514,8 +1514,8 @@ out:
*/
int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
{
- int n, err = 0, lnum = -1, uninitialized_var(offs);
- int uninitialized_var(len);
+ int n, err = 0, lnum = -1, offs;
+ int len;
unsigned int block = key_block(c, &bu->key);
struct ubifs_znode *znode;
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 49cb34c3f324..ccaf94ea5be3 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -126,8 +126,8 @@ int ubifs_search_zbranch(const struct ubifs_info *c,
const struct ubifs_znode *znode,
const union ubifs_key *key, int *n)
{
- int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
- int uninitialized_var(cmp);
+ int beg = 0, end = znode->child_cnt, mid;
+ int cmp;
const struct ubifs_zbranch *zbr = &znode->zbranch[0];
ubifs_assert(c, end > beg);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 02f03fadb75b..8e597db4d971 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -564,7 +564,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb,
udf_pblk_t newblock = 0;
uint32_t adsize;
uint32_t elen, goal_elen = 0;
- struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+ struct kernel_lb_addr eloc, goal_eloc;
struct extent_position epos, goal_epos;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(table);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 52de29000c7e..6e264dded46e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -339,7 +339,6 @@ out:
return ret;
}
-/* Should pair with userfaultfd_signal_pending() */
static inline long userfaultfd_get_blocking_state(unsigned int flags)
{
if (flags & FAULT_FLAG_INTERRUPTIBLE)
@@ -351,18 +350,6 @@ static inline long userfaultfd_get_blocking_state(unsigned int flags)
return TASK_UNINTERRUPTIBLE;
}
-/* Should pair with userfaultfd_get_blocking_state() */
-static inline bool userfaultfd_signal_pending(unsigned int flags)
-{
- if (flags & FAULT_FLAG_INTERRUPTIBLE)
- return signal_pending(current);
-
- if (flags & FAULT_FLAG_KILLABLE)
- return fatal_signal_pending(current);
-
- return false;
-}
-
/*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -516,33 +503,9 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
vmf->flags, reason);
mmap_read_unlock(mm);
- if (likely(must_wait && !READ_ONCE(ctx->released) &&
- !userfaultfd_signal_pending(vmf->flags))) {
+ if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
- ret |= VM_FAULT_MAJOR;
-
- /*
- * False wakeups can orginate even from rwsem before
- * up_read() however userfaults will wait either for a
- * targeted wakeup on the specific uwq waitqueue from
- * wake_userfault() or for signals or for uffd
- * release.
- */
- while (!READ_ONCE(uwq.waken)) {
- /*
- * This needs the full smp_store_mb()
- * guarantee as the state write must be
- * visible to other CPUs before reading
- * uwq.waken from other CPUs.
- */
- set_current_state(blocking_state);
- if (READ_ONCE(uwq.waken) ||
- READ_ONCE(ctx->released) ||
- userfaultfd_signal_pending(vmf->flags))
- break;
- schedule();
- }
}
__set_current_state(TASK_RUNNING);
diff --git a/fs/verity/open.c b/fs/verity/open.c
index d007db0c9304..bfe0280c14e4 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -221,11 +221,20 @@ out:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
- * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
- * This pairs with the READ_ONCE() in fsverity_get_info().
+ * Multiple tasks may race to set ->i_verity_info, so use
+ * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fsverity_get_info(). I.e., here we publish ->i_verity_info with a
+ * RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
+ if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
+ /* Lost the race, so free the fsverity_info we allocated. */
fsverity_free_info(vi);
+ /*
+ * Afterwards, the caller may access ->i_verity_info directly,
+ * so make sure to ACQUIRE the winning fsverity_info.
+ */
+ (void)fsverity_get_info(inode);
+ }
}
void fsverity_free_info(struct fsverity_info *vi)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f37f5cc4b19f..30525861c596 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -126,7 +126,7 @@ xfs_bmap_rtalloc(
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+ xfs_rtblock_t rtx; /* realtime extent no */
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 00db81eac80d..fdbff4860d61 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1080,7 +1080,7 @@ xfs_file_open(
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return 0;
}
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b43f0e8f43f2..9ed90368ab31 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -671,7 +671,8 @@ xlog_cil_push_work(
/*
* Wake up any background push waiters now this context is being pushed.
*/
- wake_up_all(&ctx->push_wait);
+ if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ wake_up_all(&cil->xc_push_wait);
/*
* Check if we've anything to push. If there is nothing, then we don't
@@ -743,13 +744,12 @@ xlog_cil_push_work(
/*
* initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing lsit so it can be found
+ * the current context to the CIL committing list so it can be found
* during log forces to extract the commit lsn of the sequence that
* needs to be forced.
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
- init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
@@ -937,7 +937,7 @@ xlog_cil_push_background(
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
- xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
return;
}
@@ -1216,12 +1216,12 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_commit_wait);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
- init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ec22c7a3867f..75a62870b63a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -240,7 +240,6 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
- wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
@@ -274,6 +273,7 @@ struct xfs_cil {
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
struct work_struct xc_push_work;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
/*
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index 4bcc3e61056c..b03333f1c84a 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -132,5 +132,5 @@ xfs_pwork_guess_datadev_parallelism(
* For now we'll go with the most conservative setting possible,
* which is two threads for an SSD and 1 thread everywhere else.
*/
- return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+ return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 07bc42d62673..abfb17f88f9a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -607,14 +607,14 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
int nr_pages;
ssize_t ret;
- nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
- if (!nr_pages)
- return 0;
-
max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
+ nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+ if (!nr_pages)
+ return 0;
+
bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
if (!bio)
return -ENOMEM;
@@ -1119,7 +1119,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
char *file_name;
struct dentry *dir;
unsigned int n = 0;
- int ret = -ENOMEM;
+ int ret;
/* If the group is empty, there is nothing to do */
if (!zd->nr_zones[type])
@@ -1135,8 +1135,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
zgroup_name = "seq";
dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (!dir)
+ if (!dir) {
+ ret = -ENOMEM;
goto free;
+ }
/*
* The first zone contains the super block: skip it.
@@ -1174,8 +1176,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
* Use the file number within its group as file name.
*/
snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- if (!zonefs_create_inode(dir, file_name, zone, type))
+ if (!zonefs_create_inode(dir, file_name, zone, type)) {
+ ret = -ENOMEM;
goto free;
+ }
n++;
}