summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/file.c1
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/block_dev.c317
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c211
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h11
-rw-r--r--fs/btrfs/check-integrity.c27
-rw-r--r--fs/btrfs/compression.c30
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/ctree.h127
-rw-r--r--fs/btrfs/delalloc-space.c36
-rw-r--r--fs/btrfs/delalloc-space.h10
-rw-r--r--fs/btrfs/disk-io.c107
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-io-tree.h5
-rw-r--r--fs/btrfs/extent-tree.c17
-rw-r--r--fs/btrfs/extent_io.c244
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c145
-rw-r--r--fs/btrfs/free-space-cache.c23
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c551
-rw-r--r--fs/btrfs/ioctl.c86
-rw-r--r--fs/btrfs/ordered-data.c63
-rw-r--r--fs/btrfs/ordered-data.h19
-rw-r--r--fs/btrfs/qgroup.c359
-rw-r--r--fs/btrfs/qgroup.h24
-rw-r--r--fs/btrfs/raid56.c65
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/reflink.c26
-rw-r--r--fs/btrfs/relocation.c71
-rw-r--r--fs/btrfs/scrub.c153
-rw-r--r--fs/btrfs/space-info.c2
-rw-r--r--fs/btrfs/super.c144
-rw-r--r--fs/btrfs/sysfs.c163
-rw-r--r--fs/btrfs/sysfs.h7
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c2
-rw-r--r--fs/btrfs/tests/inode-tests.c14
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/transaction.h28
-rw-r--r--fs/btrfs/tree-defrag.c5
-rw-r--r--fs/btrfs/tree-log.c50
-rw-r--r--fs/btrfs/volumes.c141
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/buffer.c11
-rw-r--r--fs/cifs/inode.c10
-rw-r--r--fs/crypto/Kconfig8
-rw-r--r--fs/crypto/Makefile1
-rw-r--r--fs/crypto/bio.c51
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c45
-rw-r--r--fs/crypto/fscrypt_private.h144
-rw-r--r--fs/crypto/inline_crypt.c367
-rw-r--r--fs/crypto/keyring.c21
-rw-r--r--fs/crypto/keysetup.c91
-rw-r--r--fs/crypto/keysetup_v1.c20
-rw-r--r--fs/crypto/policy.c20
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/efivarfs/super.c6
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/page-io.c6
-rw-r--r--fs/ext4/readpage.c11
-rw-r--r--fs/ext4/super.c12
-rw-r--r--fs/f2fs/compress.c2
-rw-r--r--fs/f2fs/data.c79
-rw-r--r--fs/f2fs/super.c35
-rw-r--r--fs/hfs/inode.c1
-rw-r--r--fs/internal.h17
-rw-r--r--fs/io-wq.c14
-rw-r--r--fs/io-wq.h11
-rw-r--r--fs/io_uring.c2601
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/jfs/jfs_mount.c1
-rw-r--r--fs/jfs/resize.c1
-rw-r--r--fs/locks.c1
-rw-r--r--fs/nfsd/nfs4state.c20
-rw-r--r--fs/ntfs/dir.c1
-rw-r--r--fs/proc/devices.c1
-rw-r--r--fs/quota/dquot.c1
-rw-r--r--fs/reiserfs/procfs.c1
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/userfaultfd.c39
-rw-r--r--fs/verity/open.c15
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_pwork.c2
-rw-r--r--fs/zonefs/super.c18
92 files changed, 4301 insertions, 2742 deletions
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a3cc8ecb50da..d553bb5bc17a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/user_namespace.h>
+#include <linux/blkdev.h>
#include "adfs.h"
#include "dir_f.h"
#include "dir_fplus.h"
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a85817f54483..a26a0f96c119 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -14,6 +14,7 @@
*/
#include <linux/uio.h>
+#include <linux/blkdev.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 64cdf4d8e424..2482032021ca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,6 +22,7 @@
#include <linux/cred.h>
#include <linux/exportfs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
#include "befs.h"
#include "btree.h"
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0ae656e022fd..8ae833e00443 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -105,16 +105,7 @@ EXPORT_SYMBOL(invalidate_bdev);
static void set_init_blocksize(struct block_device *bdev)
{
- unsigned bsize = bdev_logical_block_size(bdev);
- loff_t size = i_size_read(bdev->bd_inode);
-
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+ bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
}
int set_blocksize(struct block_device *bdev, int size)
@@ -128,9 +119,8 @@ int set_blocksize(struct block_device *bdev, int size)
return -EINVAL;
/* Don't change the size if it is same as current */
- if (bdev->bd_block_size != size) {
+ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
- bdev->bd_block_size = size;
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
@@ -703,12 +693,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -739,7 +729,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
@@ -752,7 +742,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
@@ -783,7 +773,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
@@ -799,9 +788,6 @@ static void bdev_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- spin_lock(&bdev_lock);
- list_del_init(&bdev->bd_list);
- spin_unlock(&bdev_lock);
/* Detach inode from wb early as bdi_put() may free bdi->wb */
inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -876,8 +862,6 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-static LIST_HEAD(all_bdevs);
-
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
@@ -895,7 +879,6 @@ struct block_device *bdget(dev_t dev)
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
- bdev->bd_block_size = i_blocksize(inode);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
@@ -903,9 +886,6 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- spin_lock(&bdev_lock);
- list_add(&bdev->bd_list, &all_bdevs);
- spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
@@ -926,13 +906,14 @@ EXPORT_SYMBOL(bdgrab);
long nr_blockdev_pages(void)
{
- struct block_device *bdev;
+ struct inode *inode;
long ret = 0;
- spin_lock(&bdev_lock);
- list_for_each_entry(bdev, &all_bdevs, bd_list) {
- ret += bdev->bd_inode->i_mapping->nrpages;
- }
- spin_unlock(&bdev_lock);
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ ret += inode->i_mapping->nrpages;
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
return ret;
}
@@ -1034,30 +1015,28 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
}
/**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
* @whole: the whole device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Prepare to claim @bdev. This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress. This function doesn't actually claim. On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. return, the caller
+ * has ownership of bd_claiming and bd_holder[s].
*
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
-static int bd_prepare_to_claim(struct block_device *bdev,
- struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
retry:
+ spin_lock(&bdev_lock);
/* if someone else claimed, fail */
- if (!bd_may_claim(bdev, whole, holder))
+ if (!bd_may_claim(bdev, whole, holder)) {
+ spin_unlock(&bdev_lock);
return -EBUSY;
+ }
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) {
@@ -1068,13 +1047,15 @@ retry:
spin_unlock(&bdev_lock);
schedule();
finish_wait(wq, &wait);
- spin_lock(&bdev_lock);
goto retry;
}
/* yay, all mine */
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
return 0;
}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
{
@@ -1097,78 +1078,6 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
return disk;
}
-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively. Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress. Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
-{
- struct gendisk *disk;
- struct block_device *whole;
- int partno, err;
-
- might_sleep();
-
- /*
- * @bdev might not have been initialized properly yet, look up
- * and grab the outer block device the hard way.
- */
- disk = bdev_get_gendisk(bdev, &partno);
- if (!disk)
- return ERR_PTR(-ENXIO);
-
- /*
- * Normally, @bdev should equal what's returned from bdget_disk()
- * if partno is 0; however, some drivers (floppy) use multiple
- * bdev's for the same physical device and @bdev may be one of the
- * aliases. Keep @bdev if partno is 0. This means claimer
- * tracking is broken for those devices but it has always been that
- * way.
- */
- if (partno)
- whole = bdget_disk(disk, 0);
- else
- whole = bdgrab(bdev);
-
- put_disk_and_module(disk);
- if (!whole)
- return ERR_PTR(-ENOMEM);
-
- /* prepare to claim, if successful, mark claiming in progress */
- spin_lock(&bdev_lock);
-
- err = bd_prepare_to_claim(bdev, whole, holder);
- if (err == 0) {
- whole->bd_claiming = holder;
- spin_unlock(&bdev_lock);
- return whole;
- } else {
- spin_unlock(&bdev_lock);
- bdput(whole);
- return ERR_PTR(err);
- }
-}
-EXPORT_SYMBOL(bd_start_claiming);
-
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
lockdep_assert_held(&bdev_lock);
@@ -1181,14 +1090,14 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
/**
* bd_finish_claiming - finish claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Finish exclusive open of a block device. Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish.
*/
-void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
- void *holder)
+static void bd_finish_claiming(struct block_device *bdev,
+ struct block_device *whole, void *holder)
{
spin_lock(&bdev_lock);
BUG_ON(!bd_may_claim(bdev, whole, holder));
@@ -1203,12 +1112,11 @@ void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
bd_clear_claiming(whole, holder);
spin_unlock(&bdev_lock);
}
-EXPORT_SYMBOL(bd_finish_claiming);
/**
* bd_abort_claiming - abort claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device (returned from bd_start_claiming())
+ * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Abort claiming of a block device when the exclusive open failed. This can be
@@ -1368,26 +1276,6 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif
/**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
- if (__invalidate_device(bdev, kill_dirty)) {
- printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n",
- bdev->bd_disk ? bdev->bd_disk->disk_name : "");
- }
- bdev->bd_invalidated = 1;
-}
-
-/**
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
@@ -1411,8 +1299,9 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size)
- flush_disk(bdev, false);
+ if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ pr_warn("VFS: busy inodes on resized disk %s\n",
+ disk->disk_name);
}
bdev->bd_invalidated = 0;
}
@@ -1471,7 +1360,10 @@ int check_disk_change(struct block_device *bdev)
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
- flush_disk(bdev, true);
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ disk->disk_name);
+ bdev->bd_invalidated = 1;
if (bdops->revalidate_disk)
bdops->revalidate_disk(bdev->bd_disk);
return 1;
@@ -1547,13 +1439,15 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
* mutex_lock_nested(whole->bd_mutex, 1)
*/
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+ int for_part)
{
+ struct block_device *whole = NULL, *claiming = NULL;
struct gendisk *disk;
int ret;
int partno;
int perm = 0;
- bool first_open = false;
+ bool first_open = false, unblock_events = true, need_restart;
if (mode & FMODE_READ)
perm |= MAY_READ;
@@ -1569,18 +1463,36 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
restart:
-
+ need_restart = false;
ret = -ENXIO;
disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
goto out;
+ if (partno) {
+ whole = bdget_disk(disk, 0);
+ if (!whole) {
+ ret = -ENOMEM;
+ goto out_put_disk;
+ }
+ }
+
+ if (!for_part && (mode & FMODE_EXCL)) {
+ WARN_ON_ONCE(!holder);
+ if (whole)
+ claiming = whole;
+ else
+ claiming = bdev;
+ ret = bd_prepare_to_claim(bdev, claiming, holder);
+ if (ret)
+ goto out_put_whole;
+ }
+
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
- bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
@@ -1593,20 +1505,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
- if (ret == -ERESTARTSYS) {
- /* Lost a race with 'disk' being
- * deleted, try again.
- * See md.c
- */
- disk_put_part(bdev->bd_part);
- bdev->bd_part = NULL;
- bdev->bd_disk = NULL;
- bdev->bd_queue = NULL;
- mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
- put_disk_and_module(disk);
- goto restart;
- }
+ /*
+ * If we lost a race with 'disk' being deleted,
+ * try again. See md.c
+ */
+ if (ret == -ERESTARTSYS)
+ need_restart = true;
}
if (!ret) {
@@ -1627,18 +1531,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_clear;
} else {
- struct block_device *whole;
- whole = bdget_disk(disk, 0);
- ret = -ENOMEM;
- if (!whole)
- goto out_clear;
BUG_ON(for_part);
- ret = __blkdev_get(whole, mode, 1);
- if (ret) {
- bdput(whole);
+ ret = __blkdev_get(whole, mode, NULL, 1);
+ if (ret)
goto out_clear;
- }
- bdev->bd_contains = whole;
+ bdev->bd_contains = bdgrab(whole);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1667,27 +1564,52 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
+ if (claiming)
+ bd_finish_claiming(bdev, claiming, holder);
+
+ /*
+ * Block event polling for write claims if requested. Any write holder
+ * makes the write_holder state stick until all are released. This is
+ * good enough and tracking individual writeable reference is too
+ * fragile given the way @mode is used in blkdev_get/put().
+ */
+ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+
/* only one opener holds refs to the module and disk */
if (!first_open)
put_disk_and_module(disk);
+ if (whole)
+ bdput(whole);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
- bdev->bd_queue = NULL;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
+ if (claiming)
+ bd_abort_claiming(bdev, claiming, holder);
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
+ out_put_whole:
+ if (whole)
+ bdput(whole);
+ out_put_disk:
put_disk_and_module(disk);
+ if (need_restart)
+ goto restart;
out:
-
return ret;
}
@@ -1712,50 +1634,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- struct block_device *whole = NULL;
int res;
- WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
-
- if ((mode & FMODE_EXCL) && holder) {
- whole = bd_start_claiming(bdev, holder);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return PTR_ERR(whole);
- }
- }
-
- res = __blkdev_get(bdev, mode, 0);
-
- if (whole) {
- struct gendisk *disk = whole->bd_disk;
-
- /* finish claiming */
- mutex_lock(&bdev->bd_mutex);
- if (!res)
- bd_finish_claiming(bdev, whole, holder);
- else
- bd_abort_claiming(bdev, whole, holder);
- /*
- * Block event polling for write claims if requested. Any
- * write holder makes the write_holder state stick until
- * all are released. This is good enough and tracking
- * individual writeable reference is too fragile given the
- * way @mode is used in blkdev_get/put().
- */
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- disk_block_events(disk);
- }
-
- mutex_unlock(&bdev->bd_mutex);
- bdput(whole);
- }
-
+ res =__blkdev_get(bdev, mode, holder, 0);
if (res)
bdput(bdev);
-
return res;
}
EXPORT_SYMBOL(blkdev_get);
@@ -1851,7 +1734,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
*/
filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d888e71e66b6..ea10f7bc99ab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1461,6 +1461,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c037ef514b64..613920c17ac1 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -65,11 +65,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
- /* Pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
@@ -118,12 +115,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
- atomic_inc(&cache->count);
+ refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
- if (atomic_dec_and_test(&cache->count)) {
+ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
@@ -1111,7 +1108,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
- mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
block_group->removed = 1;
/*
@@ -1143,8 +1139,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1532,21 +1526,70 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_item bg;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 flags;
+ int ret = 0;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (em->start != key->objectid || em->len != key->offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, em->start, em->len);
+ ret = -EUCLEAN;
+ goto out_free_em;
+ }
+
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_em:
+ free_extent_map(em);
+ return ret;
+}
+
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
+ int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
- struct btrfs_block_group_item bg;
- u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (1) {
slot = path->slots[0];
@@ -1563,49 +1606,10 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_stack_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
+ ret = read_bg_from_eb(fs_info, &found_key, path);
+ break;
}
+
path->slots[0]++;
}
out:
@@ -1657,19 +1661,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
return -EIO;
map = em->map_lookup;
- data_stripe_length = em->len;
+ data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- data_stripe_length = div_u64(data_stripe_length,
- map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- data_stripe_length = div_u64(data_stripe_length,
- nr_data_stripes(map));
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
io_stripe_size = map->stripe_len * nr_data_stripes(map);
- }
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
@@ -1748,25 +1745,12 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->start + cache->length)
- continue;
-
- if (logical[nr] + stripe_len <= cache->start)
- continue;
-
- start = logical[nr];
- if (start < cache->start) {
- start = cache->start;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->start + cache->length - start);
- }
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, start, len);
+ ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+ len);
if (ret) {
kfree(logical);
return ret;
@@ -1818,7 +1802,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
- atomic_set(&cache->count, 1);
+ refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
@@ -2207,54 +2191,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return 0;
}
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
- BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
-}
-
/*
* Mark one block group RO, can be called several times for the same block
* group.
@@ -2300,7 +2236,7 @@ again:
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
@@ -2327,7 +2263,7 @@ again:
ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2521,7 +2457,8 @@ again:
num_pages *= 16;
num_pages *= PAGE_SIZE;
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ num_pages);
if (ret)
goto out_put;
@@ -3392,7 +3329,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
+ ASSERT(refcount_read(&block_group->refs) == 1);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
@@ -3447,7 +3384,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- mutex_lock(&fs_info->chunk_mutex);
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->start,
@@ -3455,7 +3391,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- mutex_unlock(&fs_info->chunk_mutex);
/* once for us and once for the tree */
free_extent_map(em);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index b6ee70a039c7..adfd7583a17b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -114,8 +114,7 @@ struct btrfs_block_group {
/* For block groups in the same raid type */
struct list_head list;
- /* Usage count */
- atomic_t count;
+ refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e7d709505cb1..c47b6c6fea9f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -152,6 +152,17 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * The id/generation of the last transaction where this inode was
+ * either the source or the destination of a clone/dedupe operation.
+ * Used when logging an inode to know if there are shared extents that
+ * need special care when logging checksum items, to avoid duplicate
+ * checksum items in a log (which can lead to a corruption where we end
+ * up with missing checksum ranges after log replay).
+ * Protected by the vfs inode lock.
+ */
+ u64 last_reflink_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 32e11a23b47f..81a8c87a5afb 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -631,10 +631,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (NULL == selected_super) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!selected_super)
return -ENOMEM;
- }
list_for_each_entry(device, dev_head, dev_list) {
int i;
@@ -795,7 +793,6 @@ static int btrfsic_process_superblock_dev_mirror(
if (NULL == superblock_tmp) {
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
- pr_info("btrfsic: error, kmalloc failed!\n");
ret = -1;
goto out;
}
@@ -921,9 +918,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
struct btrfsic_stack_frame *sf;
sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (NULL == sf)
- pr_info("btrfsic: alloc memory failed!\n");
- else
+ if (sf)
sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
return sf;
}
@@ -1313,7 +1308,6 @@ static int btrfsic_create_link_to_next_block(
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(next_block_ctx);
*next_blockp = NULL;
return -1;
@@ -1470,7 +1464,6 @@ static int btrfsic_handle_extent_data(
mirror_num,
&block_was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&next_block_ctx);
return -1;
}
@@ -2013,7 +2006,6 @@ again:
block = btrfsic_block_alloc();
if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
goto continue_loop;
}
@@ -2234,7 +2226,6 @@ static int btrfsic_process_written_superblock(
mirror_num,
&was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&tmp_next_block_ctx);
return -1;
}
@@ -2542,10 +2533,8 @@ static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
&state->block_link_hashtable);
if (NULL == l) {
l = btrfsic_block_link_alloc();
- if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!l)
return NULL;
- }
l->block_ref_to = next_block;
l->block_ref_from = from_block;
@@ -2589,10 +2578,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
struct btrfsic_dev_state *dev_state;
block = btrfsic_block_alloc();
- if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!block)
return NULL;
- }
+
dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
if (NULL == dev_state) {
pr_info("btrfsic: error, lookup dev_state failed!\n");
@@ -2797,10 +2785,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
return -1;
}
state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- pr_info("btrfs check-integrity: allocation failed!\n");
+ if (!state)
return -ENOMEM;
- }
if (!btrfsic_is_initialized) {
mutex_init(&btrfsic_mutex);
@@ -2829,7 +2815,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
ds = btrfsic_dev_state_alloc();
if (NULL == ds) {
- pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
return -ENOMEM;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6e648603f85..1ab56a734e70 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -172,18 +172,17 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
-static int check_compressed_csum(struct btrfs_inode *inode,
- struct compressed_bio *cb,
+static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int ret;
struct page *page;
unsigned long i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
+ struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
@@ -201,15 +200,15 @@ static int check_compressed_csum(struct btrfs_inode *inode,
if (memcmp(&csum, cb_sum, csum_size)) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- ret = -EIO;
- goto fail;
+ if (btrfs_io_bio(bio)->device)
+ btrfs_dev_stat_inc_and_print(
+ btrfs_io_bio(bio)->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ return -EIO;
}
cb_sum += csum_size;
-
}
- ret = 0;
-fail:
- return ret;
+ return 0;
}
/* when we finish reading compressed pages from the disk, we
@@ -244,7 +243,6 @@ static void end_compressed_bio_read(struct bio *bio)
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- ASSERT(btrfs_io_bio(cb->orig_bio));
btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
@@ -256,7 +254,7 @@ static void end_compressed_bio_read(struct bio *bio)
goto csum_failed;
inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), cb,
+ ret = check_compressed_csum(BTRFS_I(inode), bio,
(u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -405,7 +403,7 @@ out:
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
@@ -413,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
unsigned long bytes_left;
@@ -421,7 +419,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page *page;
u64 first_byte = disk_start;
blk_status_t ret;
- int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -429,7 +427,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
- cb->inode = inode;
+ cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
@@ -455,7 +453,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
int submit = 0;
page = compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
+ page->mapping = inode->vfs_inode.i_mapping;
if (bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 284a3ad31350..9f3dbe372631 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -8,6 +8,8 @@
#include <linux/sizes.h>
+struct btrfs_inode;
+
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
@@ -88,7 +90,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio *bio);
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 82ab6e5a386d..70e49d8d4f6c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1501,6 +1501,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1513,6 +1529,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d404cce8ae40..9c7e466f27a9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -546,11 +546,6 @@ enum {
*/
BTRFS_FS_EXCL_OP,
/*
- * To info transaction_kthread we need an immediate commit so it
- * doesn't need to wait for commit_interval
- */
- BTRFS_FS_NEED_ASYNC_COMMIT,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -779,6 +774,7 @@ struct btrfs_fs_info {
u32 thread_pool_size;
struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
u64 total_pinned;
@@ -1011,6 +1007,8 @@ enum {
BTRFS_ROOT_DEAD_TREE,
/* The root has a log tree. Used only for subvolume roots. */
BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
};
/*
@@ -1059,8 +1057,10 @@ struct btrfs_root {
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
struct list_head log_ctxs[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_writers;
atomic_t log_commit[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
int log_transid;
/* No matter the commit succeeds or not*/
@@ -1075,7 +1075,6 @@ struct btrfs_root {
u64 highest_objectid;
- u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1162,6 +1161,7 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
@@ -1277,18 +1277,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
/*
* Requests for changes that need to be done during transaction commit.
@@ -1895,6 +1895,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
const struct btrfs_disk_key *disk)
{
@@ -1936,6 +1982,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
btrfs_disk_key_to_cpu(key, &disk_key);
}
+#endif
+
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -2232,7 +2280,8 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
}
/* struct btrfs_file_extent_item */
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
@@ -2241,6 +2290,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
@@ -2257,6 +2308,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -2508,16 +2560,46 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
+
/*
- * Flushing delalloc may cause deadlock somewhere, in this
- * case, use FLUSH LIMIT
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Commiting transaction
+ *
+ * Can be interruped by fatal signal.
+ */
BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interruped by fatal signal.
+ */
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
@@ -2831,8 +2913,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- u64 file_start, int contig);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -2875,7 +2957,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2928,7 +3010,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -2962,7 +3044,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -2978,10 +3060,13 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3194,7 +3279,7 @@ do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO) { \
+ if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
(errno)); \
@@ -3378,7 +3463,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 1245739a3a6e..0e354e9e57d0 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -237,10 +237,10 @@ commit_trans:
return 0;
}
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
/* align the range */
@@ -248,14 +248,14 @@ int btrfs_check_data_free_space(struct inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
else
ret = 0;
return ret;
@@ -269,16 +269,12 @@ int btrfs_check_data_free_space(struct inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
+ ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
@@ -293,17 +289,17 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->fs_info->sectorsize) -
- round_down(start, root->fs_info->sectorsize);
- start = round_down(start, root->fs_info->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
@@ -557,7 +553,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
@@ -565,7 +561,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
if (ret < 0)
btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
@@ -583,10 +579,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+ btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 54466fbd7075..28bf5c3ef430 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -6,18 +6,18 @@
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b1a148058773..9ae25f632157 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1116,6 +1116,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1141,10 +1142,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (!dummy)
- root->defrag_trans_start = fs_info->generation;
- else
- root->defrag_trans_start = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1395,7 +1392,12 @@ alloc_fail:
goto out;
}
-static int btrfs_init_fs_root(struct btrfs_root *root)
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
unsigned int nofs_flag;
@@ -1428,9 +1430,20 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ if (!anon_dev) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ } else {
+ root->anon_dev = anon_dev;
+ }
+ }
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
@@ -1534,8 +1547,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
}
-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, bool check_ref)
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid: root id
+ * @anon_dev: preallocated anonymous block device number for new roots,
+ * pass 0 for new allocation.
+ * @check_ref: whether to check root item references, If true, return -ENOENT
+ * for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev,
+ bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1564,6 +1596,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
+ /* Shouldn't get preallocated anon_dev for cached roots */
+ ASSERT(!anon_dev);
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1583,7 +1617,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root);
+ ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
@@ -1616,25 +1650,31 @@ fail:
return ERR_PTR(ret);
}
-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid: tree objectid
+ * @check_ref: if set, verify that the tree exists and the item has at least
+ * one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref)
{
- struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
- int ret = 0;
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
+ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}
- rcu_read_lock();
- list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
- bdi = device->bdev->bd_bdi;
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid: tree objectid
+ * @anon_dev: if zero, allocate a new anonymous block device or use the
+ * parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev)
+{
+ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
/*
@@ -1749,7 +1789,6 @@ static int transaction_kthread(void *arg)
now = ktime_get_seconds();
if (cur->state < TRANS_STATE_COMMIT_START &&
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
@@ -2001,8 +2040,7 @@ void btrfs_put_root(struct btrfs_root *root)
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
+ free_root_extent_buffers(root);
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
@@ -3053,8 +3091,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->congested_fn = btrfs_congested_fn;
- sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
@@ -4058,6 +4094,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bf43245406c4..00dc39d47ed3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index b6561455b3c4..f39d47a2d01a 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -233,14 +233,11 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
struct extent_state **cached_state);
/* This should be reworked in the future and put elsewhere. */
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec);
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0bc35f932bf..61ede335f6c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5466,6 +5473,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
}
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 608f93438b29..617ea38e6fd7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1999,7 +1999,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -2017,15 +2018,14 @@ out:
return err;
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
{
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
- __process_pages_contig(inode->i_mapping, locked_page,
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
page_ops, NULL);
}
@@ -2122,12 +2122,11 @@ out:
return ret;
}
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
@@ -2136,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start,
*/
node = tree_search(tree, start);
if (!node) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
- *failrec = state->failrec;
+
+ failrec = state->failrec;
out:
spin_unlock(&tree->lock);
- return ret;
+ return failrec;
}
/*
@@ -2377,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret)
+ failrec = get_state_failrec(failure_tree, start);
+ if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
@@ -2450,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
spin_unlock(&failure_tree->lock);
}
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2462,65 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
int ret;
u64 logical;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret) {
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
-
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
- failrec->in_validation = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return -EIO;
- }
-
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
- kfree(failrec);
- return -EIO;
- }
-
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
- /* set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0)
- ret = set_state_failrec(failure_tree, start, failrec);
- /* set the bits in the inode's tree */
- if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
- if (ret < 0) {
- kfree(failrec);
- return ret;
- }
- } else {
+ failrec = get_state_failrec(failure_tree, start);
+ if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
failrec->logical, failrec->start, failrec->len,
@@ -2530,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
+
+ return failrec;
}
- *failrec_ret = failrec;
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
- return 0;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ }
+
+ btrfs_debug(fs_info,
+ "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
+ logical, start, failrec->len);
+
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* Set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY);
+ if (ret >= 0) {
+ ret = set_state_failrec(failure_tree, start, failrec);
+ /* Set the bits in the inode's tree */
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ } else if (ret < 0) {
+ kfree(failrec);
+ return ERR_PTR(ret);
+ }
+
+ return failrec;
}
static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
@@ -2659,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
- int ret;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return errno_to_blk_status(ret);
+ failrec = btrfs_get_io_failure_record(inode, start, end);
+ if (IS_ERR(failrec))
+ return errno_to_blk_status(PTR_ERR(failrec));
need_validation = btrfs_io_needs_validation(inode, failed_bio);
@@ -3419,7 +3416,7 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
@@ -3432,7 +3429,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, page,
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3449,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
* started, so we don't want to return > 0 unless
* things are going well.
*/
- ret = ret < 0 ? ret : -EIO;
- goto done;
+ return ret < 0 ? ret : -EIO;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3482,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
return 1;
}
- ret = 0;
-
-done:
- return ret;
+ return 0;
}
/*
@@ -3496,7 +3489,7 @@ done:
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
@@ -3504,7 +3497,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3536,7 +3529,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
update_nr_written(wbc, nr_written + 1);
end = page_end;
- blocksize = inode->i_sb->s_blocksize;
+ blocksize = inode->vfs_inode.i_sb->s_blocksize;
while (cur <= end) {
u64 em_end;
@@ -3547,8 +3540,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, 1);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
- end - cur + 1);
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@@ -3585,7 +3577,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
+ btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
@@ -3658,15 +3650,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
if (!epd->extent_locked) {
- ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
+ &nr_written);
if (ret == 1)
return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ nr_written, &nr);
if (ret == 1)
return 0;
@@ -4126,7 +4119,7 @@ retry:
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
- ret = -EUCLEAN;
+ ret = -EROFS;
end_write_bio(&epd, ret);
}
return ret;
@@ -4488,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
@@ -4501,15 +4497,45 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (!test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL)) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &btrfs_inode->runtime_flags);
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
- }
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+next:
start = extent_map_end(em);
write_unlock(&map->lock);
@@ -4669,7 +4695,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+ u64 start, u64 len)
{
int ret = 0;
u64 off = start;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 87f60a48f750..00a88f2eb5ab 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -204,7 +204,7 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
+ u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -277,7 +277,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 706a3128e192..7d5ec71615b8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -522,10 +522,10 @@ fail:
* means this bio can contains potentially discontigous bio vecs
* so the logical offset of each should be calculated separately.
*/
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0d2c976587e..bb824c7cb7c7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -500,18 +500,18 @@ next:
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
@@ -524,13 +524,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (!btrfs_is_free_space_inode(inode)) {
if (start_pos >= isize &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
/*
* There can't be any extents following eof in this case
* so just set the delalloc new bit for the range
@@ -538,8 +538,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
*/
extra_bits |= EXTENT_DELALLOC_NEW;
} else {
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
- start_pos,
+ err = btrfs_find_new_delalloc_bytes(inode, start_pos,
num_bytes, cached);
if (err)
return err;
@@ -564,7 +563,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* at this time.
*/
if (end_pos > isize)
- i_size_write(inode, end_pos);
+ i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
@@ -731,7 +730,7 @@ next:
* is deleted from the tree.
*/
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -744,7 +743,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct inode *vfs_inode = &inode->vfs_inode;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -762,9 +762,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int leafs_visited = 0;
if (drop_cache)
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ if (start >= inode->disk_i_size && !replace_extent)
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
@@ -935,7 +935,7 @@ next_slot:
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, end - key.offset);
+ inode_sub_bytes(vfs_inode, end - key.offset);
break;
}
@@ -955,7 +955,7 @@ next_slot:
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, extent_end - start);
+ inode_sub_bytes(vfs_inode, extent_end - start);
if (end == extent_end)
break;
@@ -979,7 +979,7 @@ delete_extent_item:
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
@@ -993,7 +993,7 @@ delete_extent_item:
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
}
@@ -1082,8 +1082,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache, 0, 0, NULL);
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
+ end, NULL, drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
@@ -1532,8 +1532,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes, bool nowait)
+static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1541,6 +1541,9 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
@@ -1583,6 +1586,42 @@ out_unlock:
return ret;
}
+static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, true);
+}
+
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * >0 and update @write_bytes if we can do nocow write
+ * 0 if we can't do nocow write
+ * -EAGAIN if we can't get the needed lock or there are ordered extents
+ * for * (nowait == true) case
+ * <0 if other error happened
+ *
+ * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, false);
+}
+
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
@@ -1590,7 +1629,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1643,13 +1681,12 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
write_bytes);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes, false) > 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1674,11 +1711,11 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
@@ -1748,7 +1785,7 @@ again:
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
@@ -1758,8 +1795,9 @@ again:
fs_info->sectorsize);
if (copied > 0)
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state);
/*
* If we have not locked the extent range, because the range's
@@ -1782,7 +1820,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1800,8 +1838,6 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
@@ -1811,11 +1847,12 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
@@ -1926,10 +1963,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* We will allocate space in case nodatacow is not set,
* so bail
*/
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
- true) <= 0) {
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
+ <= 0) {
inode_unlock(inode);
return -EAGAIN;
}
@@ -2598,7 +2633,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
cur_offset = start;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
cur_offset, end + 1, &drop_end,
1, 0, 0, NULL);
if (ret != -ENOSPC) {
@@ -3176,14 +3211,14 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- alloc_start, bytes_to_reserve);
- if (ret)
- goto out;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret)
+ goto out;
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
@@ -3199,7 +3234,7 @@ reserve_space:
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
@@ -3350,8 +3385,9 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset,
+ last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
@@ -3363,8 +3399,9 @@ static long btrfs_fallocate(struct file *file, int mode,
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, last_byte - cur_offset);
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3381,7 +3418,7 @@ static long btrfs_fallocate(struct file *file, int mode,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
@@ -3402,7 +3439,7 @@ out:
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
@@ -3500,7 +3537,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return generic_file_open(inode, filp);
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 55955bd424d7..6d961e11639e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1334,8 +1334,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
- i_size_read(inode), &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
+ io_ctl->num_pages, 0, i_size_read(inode),
+ &cached_state);
if (ret)
goto out_nospc;
@@ -2703,8 +2704,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
-static int
-__btrfs_return_cluster_to_free_space(
+static void __btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
@@ -2756,7 +2756,6 @@ __btrfs_return_cluster_to_free_space(
out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
- return 0;
}
static void __btrfs_remove_free_space_cache_locked(
@@ -2907,12 +2906,11 @@ out:
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
- int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
@@ -2920,28 +2918,27 @@ int btrfs_return_cluster_to_free_space(
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
- ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
/* finally drop our ref */
btrfs_put_block_group(block_group);
- return ret;
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
@@ -3358,7 +3355,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
list_del_init(&entry->list);
if (!ret) {
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 2e0a8077aa74..e3d5e0ad8f8e 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -136,7 +136,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 6009e0e939b5..76d2e43817ea 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,7 +495,8 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
+ prealloc);
if (ret)
goto out_put;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43c803c16b48..611b3412fbfd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -80,17 +80,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate);
@@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct inode *inode,
* to be released, which we want to happen only when finishing the ordered
* extent (btrfs_finish_ordered_io()).
*/
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 bytes)
{
@@ -116,7 +116,7 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
struct page *page;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
@@ -274,15 +274,15 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
u64 aligned_end = ALIGN(end, fs_info->sectorsize);
@@ -314,7 +314,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (compressed_size && compressed_pages)
extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -323,9 +323,9 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
- ret = __btrfs_drop_extents(trans, root, inode, path,
- start, aligned_end, NULL,
- 1, 1, extent_item_size, &extent_inserted);
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
+ NULL, 1, 1, extent_item_size,
+ &extent_inserted);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -334,7 +334,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, path, extent_inserted,
- root, inode, start,
+ root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
@@ -345,8 +345,8 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -412,10 +412,10 @@ static noinline int add_async_extent(struct async_chunk *cow,
/*
* Check if the inode has flags compatible with compression
*/
-static inline bool inode_can_compress(struct inode *inode)
+static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
return false;
return true;
}
@@ -424,29 +424,30 @@ static inline bool inode_can_compress(struct inode *inode)
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(BTRFS_I(inode)));
+ btrfs_ino(inode));
return 0;
}
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* defrag ioctl */
- if (BTRFS_I(inode)->defrag_compress)
+ if (inode->defrag_compress)
return 1;
/* bad compression ratios */
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
if (btrfs_test_opt(fs_info, COMPRESS) ||
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
- BTRFS_I(inode)->prop_compress)
- return btrfs_compress_heuristic(inode, start, end);
+ inode->flags & BTRFS_INODE_COMPRESS ||
+ inode->prop_compress)
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
return 0;
}
@@ -552,7 +553,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(inode, start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -616,11 +617,12 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ 0, BTRFS_COMPRESS_NONE,
+ NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(inode, start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
total_compressed,
compress_type, pages);
}
@@ -642,7 +644,8 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ NULL,
clear_flags,
PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
@@ -762,14 +765,14 @@ static void free_async_extent_pages(struct async_extent *async_extent)
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct extent_io_tree *io_tree = &inode->io_tree;
int ret = 0;
again:
@@ -802,7 +805,7 @@ retry:
* all those pages down to the drive.
*/
if (!page_started && !ret)
- extent_write_locked_range(inode,
+ extent_write_locked_range(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
@@ -832,7 +835,7 @@ retry:
* will not submit these pages down to lower
* layers.
*/
- extent_range_redirty_for_io(inode,
+ extent_range_redirty_for_io(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
@@ -867,8 +870,7 @@ retry:
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- async_extent->start,
+ btrfs_drop_extent_cache(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1, 0);
goto out_free_reserve;
@@ -884,8 +886,7 @@ retry:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
- if (btrfs_submit_compressed_write(inode,
- async_extent->start,
+ if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
@@ -896,12 +897,11 @@ retry:
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
- p->mapping = inode->i_mapping;
+ p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(p, start, end, 0);
p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end,
- NULL, 0,
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
PAGE_END_WRITEBACK |
PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
@@ -929,10 +929,10 @@ out_free:
goto again;
}
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 num_bytes)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
@@ -974,13 +974,13 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
@@ -994,7 +994,7 @@ static noinline int cow_file_range(struct inode *inode,
bool extent_reserved = false;
int ret = 0;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
@@ -1004,7 +1004,7 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = max(blocksize, num_bytes);
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
if (start == 0) {
/* lets try to make an inline extent */
@@ -1033,8 +1033,7 @@ static noinline int cow_file_range(struct inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + num_bytes - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1098,7 +1097,7 @@ static noinline int cow_file_range(struct inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
@@ -1114,8 +1113,7 @@ static noinline int cow_file_range(struct inode *inode,
page_ops = unlock ? PAGE_UNLOCK : 0;
page_ops |= PAGE_SET_PRIVATE2;
- extent_clear_unlock_delalloc(inode, start,
- start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1139,7 +1137,7 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1236,13 +1234,13 @@ static noinline void async_cow_free(struct btrfs_work *work)
kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode,
+static int cow_file_range_async(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
@@ -1254,9 +1252,9 @@ static int cow_file_range_async(struct inode *inode,
unsigned nofs_flag;
const unsigned int write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
num_chunks = 1;
should_compress = false;
@@ -1294,9 +1292,9 @@ static int cow_file_range_async(struct inode *inode,
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
- ihold(inode);
+ ihold(&inode->vfs_inode);
async_chunk[i].pending = &ctx->num_chunks;
- async_chunk[i].inode = inode;
+ async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1373,15 +1371,15 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
return 1;
}
-static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end,
int *page_started, unsigned long *nr_written)
{
- const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
- const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
+ const bool is_reloc_ino = (inode->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID);
const u64 range_bytes = end + 1 - start;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
u64 count;
@@ -1421,7 +1419,7 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
EXTENT_NORESERVE, 0);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
@@ -1447,21 +1445,21 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static noinline int run_delalloc_nocow(struct inode *inode,
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
int *page_started, int force,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const bool freespace_inode = btrfs_is_free_space_inode(inode);
+ u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
@@ -1687,8 +1685,8 @@ out_check:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page, cow_start,
- found_key.offset - 1,
+ ret = fallback_to_cow(inode, locked_page,
+ cow_start, found_key.offset - 1,
page_started, nr_written);
if (ret)
goto error;
@@ -1716,8 +1714,7 @@ out_check:
num_bytes,
BTRFS_ORDERED_PREALLOC);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
0);
goto error;
@@ -1793,11 +1790,11 @@ error:
return ret;
}
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
{
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC))
return 0;
/*
@@ -1805,9 +1802,8 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* if is not zero, it means the file is defragging.
* Force cow if given extent needs to be defragged.
*/
- if (BTRFS_I(inode)->defrag_bytes &&
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_DEFRAG, 0, NULL))
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
return 1;
return 0;
@@ -1817,26 +1813,25 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1);
} else {
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
@@ -2085,9 +2080,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(
- &inode->vfs_inode,
- state->start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2163,7 +2156,7 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
struct inode *inode = private_data;
blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -2228,7 +2221,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
0, inode, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
if (ret)
goto out;
}
@@ -2265,13 +2258,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- extra_bits, cached_state);
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2288,7 +2281,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 page_start;
u64 page_end;
int ret = 0;
@@ -2296,7 +2289,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = fixup->inode;
+ inode = BTRFS_I(fixup->inode);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2333,8 +2326,7 @@ again:
* when the page was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
@@ -2350,20 +2342,18 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
goto out_reserved;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2383,11 +2373,11 @@ again:
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
out_page:
if (ret) {
@@ -2410,7 +2400,7 @@ out_page:
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(&inode->vfs_inode);
}
/*
@@ -2466,18 +2456,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 file_pos,
- u64 disk_bytenr, u64 disk_num_bytes,
- u64 num_bytes, u64 ram_bytes,
- u8 compression, u8 encryption,
- u16 other_encoding, int extent_type)
+ struct btrfs_inode *inode, u64 file_pos,
+ struct btrfs_file_extent_item *stack_fi,
+ u64 qgroup_reserved)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 qg_released;
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
int extent_inserted = 0;
int ret;
@@ -2496,60 +2486,42 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
*/
ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
file_pos + num_bytes, NULL, 0,
- 1, sizeof(*fi), &extent_inserted);
+ 1, sizeof(*stack_fi), &extent_inserted);
if (ret)
goto out;
if (!extent_inserted) {
- ins.objectid = btrfs_ino(BTRFS_I(inode));
+ ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*fi));
+ sizeof(*stack_fi));
if (ret)
goto out;
}
leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_set_file_extent_type(leaf, fi, extent_type);
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, 0);
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
- btrfs_set_file_extent_compression(leaf, fi, compression);
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
+ write_extent_buffer(leaf, stack_fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_file_extent_item));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(inode, num_bytes);
+ inode_add_bytes(&inode->vfs_inode, num_bytes);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
- ram_bytes);
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
goto out;
- /*
- * Release the reserved range from inode dirty range map, as it is
- * already moved into delayed_ref_head
- */
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
- if (ret < 0)
- goto out;
- qg_released = ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
- file_pos, qg_released, &ins);
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos, qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2571,7 +2543,33 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
}
-/* as ordered data IO finishes, this gets called so we can finish
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_file_extent_item stack_fi;
+ u64 logical_len;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
+ oe->disk_num_bytes);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
+ logical_len = oe->truncated_len;
+ else
+ logical_len = oe->num_bytes;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
+ &stack_fi, oe->qgroup_rsv);
+}
+
+/*
+ * As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
@@ -2622,13 +2620,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- /*
- * For mwrite(mmap + memset to write) case, we still reserve
- * space for NOCOW range.
- * As NOCOW won't cause a new delayed ref, just free the space
- */
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -2665,20 +2656,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_reserved_file_extent(trans, inode, start,
- ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes,
- logical_len, logical_len,
- compress_type, 0, 0,
- BTRFS_FILE_EXTENT_REG);
+ ret = insert_ordered_extent_file_extent(trans, inode,
+ ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2830,6 +2815,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
io_bio->mirror_num);
+ if (io_bio->device)
+ btrfs_dev_stat_inc_and_print(io_bio->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -3348,6 +3336,14 @@ cache_index:
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Same logic as for last_unlink_trans. We don't persist the generation
+ * of the last transaction where this inode was used for a reflink
+ * operation, so after eviction and reloading the inode we must be
+ * pessimistic and assume the last transaction that modified the inode.
+ */
+ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -3496,7 +3492,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
fill_inode_item(trans, leaf, inode_item, inode);
btrfs_mark_buffer_dirty(leaf);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
ret = 0;
failed:
btrfs_free_path(path);
@@ -3526,7 +3522,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
return ret;
}
@@ -4041,6 +4037,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
}
}
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
@@ -4511,11 +4509,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
char *kaddr;
+ bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
+ size_t write_bytes = blocksize;
int ret = 0;
u64 block_start;
u64 block_end;
@@ -4527,15 +4527,28 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- block_start, blocksize);
- if (ret)
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
+ block_start, blocksize);
+ if (ret < 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
+ &write_bytes) > 0) {
+ /* For nocow case, no need to reserve data space */
+ only_release_metadata = true;
+ } else {
+ goto out;
+ }
+ }
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, block_start, blocksize);
goto out;
-
+ }
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
block_start, blocksize, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
@@ -4560,7 +4573,7 @@ again:
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(inode, block_start);
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
@@ -4575,7 +4588,7 @@ again:
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
@@ -4600,14 +4613,26 @@ again:
set_page_dirty(page);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ if (only_release_metadata)
+ set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+ block_end, EXTENT_NORESERVE, NULL, NULL,
+ GFP_NOFS);
+
out_unlock:
- if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (ret) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ blocksize, true);
+ else
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ block_start, blocksize, true);
+ }
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
extent_changeset_free(data_reserved);
return ret;
}
@@ -4965,7 +4990,8 @@ static void evict_inode_truncate_pages(struct inode *inode)
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state_flags & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+ end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DELALLOC |
@@ -6040,7 +6066,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
btrfs_update_root_times(trans, root);
@@ -6849,7 +6875,7 @@ out:
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -6863,21 +6889,19 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
int ret;
if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start,
- block_start, block_len, orig_block_len,
- ram_bytes,
+ em = create_io_em(inode, start, len, orig_start, block_start,
+ block_len, orig_block_len, ram_bytes,
BTRFS_COMPRESS_NONE, /* compress_type */
type);
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
- len, block_len, type);
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
+ block_len, type);
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + len - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
}
em = ERR_PTR(ret);
}
@@ -6886,11 +6910,11 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
return em;
}
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
@@ -6907,15 +6931,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
return em;
}
/*
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
- * block must be cow'd
+ * Check if we can do nocow write into the range [@offset, @offset + @len)
+ *
+ * @offset: File offset
+ * @len: The length to write, will be updated to the nocow writeable
+ * range
+ * @orig_start: (optional) Return the original file offset of the file extent
+ * @orig_len: (optional) Return the original on-disk length of the file extent
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks for (nowait == false) case.
+ *
+ * Return:
+ * >0 and update @len if we can do nocow write
+ * 0 if we can't do nocow write
+ * <0 if error happened
+ *
+ * NOTE: This only checks the file extents, caller is responsible to wait for
+ * any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
@@ -7142,8 +7183,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type)
@@ -7157,7 +7198,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &BTRFS_I(inode)->extent_tree;
+ em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7179,8 +7220,8 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}
do {
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
- em->start + em->len - 1, 0);
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
@@ -7259,7 +7300,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
- em2 = btrfs_create_dio_extent(inode, start, len,
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
@@ -7278,8 +7319,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* use the existing or preallocated extent, so does not
* need to adjust btrfs_space_info's bytes_may_use.
*/
- btrfs_free_reserved_data_space_noquota(inode, start,
- len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
goto skip_cow;
}
}
@@ -7287,7 +7327,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
/* this will cow the extent */
len = bh_result->b_size;
free_extent_map(em);
- *map = em = btrfs_new_extent_direct(inode, start, len);
+ *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -7438,7 +7478,8 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ __endio_write_update_ordered(BTRFS_I(dip->inode),
+ dip->logical_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
@@ -7524,18 +7565,18 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
return err;
}
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -7543,9 +7584,9 @@ static void __endio_write_update_ordered(struct inode *inode,
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
+ &ordered_offset,
+ ordered_bytes,
+ uptodate)) {
btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
NULL);
btrfs_queue_work(wq, &ordered->work);
@@ -7572,7 +7613,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
{
struct inode *inode = private_data;
blk_status_t ret;
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -7633,7 +7674,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
if (ret)
goto err;
} else {
@@ -7883,7 +7924,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
offset, count);
if (ret)
goto out;
@@ -7915,8 +7956,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, offset, dio_data.reserve,
+ true);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -7925,13 +7967,13 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
+ __endio_write_update_ordered(BTRFS_I(inode),
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
false);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
offset, count - (size_t)ret, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
@@ -7946,7 +7988,7 @@ out:
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+ u64 start, u64 len)
{
int ret;
@@ -8123,20 +8165,17 @@ again:
/*
* Qgroup reserved space handler
* Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
+ * 1) Already written to disk or ordered extent already submitted
+ * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
+ * bit of its io_tree, and free the qgroup reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8200,8 +8239,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- reserved_space);
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
if (!ret2) {
ret2 = file_update_time(vmf->vma->vm_file);
reserved = 1;
@@ -8248,9 +8287,9 @@ again:
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space,
- true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
}
}
@@ -8265,7 +8304,7 @@ again:
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
unlock_extent_cached(io_tree, page_start, page_end,
@@ -8305,7 +8344,7 @@ out_unlock:
unlock_page(page);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
@@ -8519,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_reflink_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -8605,7 +8645,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(inode);
+ btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
@@ -9587,6 +9627,31 @@ out_unlock:
return err;
}
+static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_key *ins,
+ u64 file_offset)
+{
+ struct btrfs_file_extent_item stack_fi;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+ if (ret < 0)
+ return ret;
+ return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
+ &stack_fi, ret);
+}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9645,11 +9710,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_reserved_file_extent(trans, inode,
- cur_offset, ins.objectid,
- ins.offset, ins.offset,
- ins.offset, 0, 0, 0,
- BTRFS_FILE_EXTENT_PREALLOC);
+ ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
if (ret) {
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
@@ -9722,7 +9783,7 @@ next:
btrfs_end_transaction(trans);
}
if (clear_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
end - clear_offset + 1);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8f7c5f00894..bd3511c5ca81 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
-/* Check if @flags are a supported and valid set of FS_*_FL flags */
-static int check_fsflags(unsigned int flags)
+/*
+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
+ * the old and new flags are not conflicting
+ */
+static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags)
FS_NOCOW_FL))
return -EOPNOTSUPP;
+ /* COMPR and NOCOMP on new/old are valid */
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
+ return -EINVAL;
+
+ /* NOCOW and compression options are mutually exclusive */
+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+
return 0;
}
@@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags = binode->flags;
+ u32 binode_flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
- ret = check_fsflags(fsflags);
- if (ret)
- return ret;
-
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
-
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+
ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
if (ret)
goto out_unlock;
+ ret = check_fsflags(old_fsflags, fsflags);
+ if (ret)
+ goto out_unlock;
+
+ binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
@@ -566,6 +580,7 @@ static noinline int create_subvol(struct inode *dir,
struct inode *inode;
int ret;
int err;
+ dev_t anon_dev = 0;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
@@ -578,6 +593,10 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail_free;
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto fail_free;
+
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
@@ -660,12 +679,15 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
key.offset = (u64)-1;
- new_root = btrfs_get_fs_root(fs_info, objectid, true);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
+ free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
+ /* Freeing will be done in btrfs_put_root() of new_root */
+ anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root);
@@ -735,6 +757,8 @@ fail:
return ret;
fail_free:
+ if (anon_dev)
+ free_anon_bdev(anon_dev);
kfree(root_item);
return ret;
}
@@ -762,6 +786,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot)
return -ENOMEM;
+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
+ if (ret < 0)
+ goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
@@ -823,10 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
+ pending_snapshot->anon_dev = 0;
fail:
+ /* Prevent double freeing of anon_dev */
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
@@ -1243,7 +1276,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
@@ -1265,7 +1298,7 @@ again:
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
page_start);
unlock_extent_cached(tree, page_start, page_end,
&cached_state);
@@ -1333,7 +1366,7 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1361,7 +1394,7 @@ out:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
@@ -3198,11 +3231,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 flags_in;
int ret = 0;
- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
- if (!fi_args)
- return -ENOMEM;
+ fi_args = memdup_user(arg, sizeof(*fi_args));
+ if (IS_ERR(fi_args))
+ return PTR_ERR(fi_args);
+
+ flags_in = fi_args->flags;
+ memset(fi_args, 0, sizeof(*fi_args));
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
@@ -3218,6 +3255,23 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
+ fi_args->generation = fs_info->generation;
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
+ sizeof(fi_args->metadata_uuid));
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
+ }
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e13b3d28c063..ebac13389e7e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -15,6 +15,7 @@
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -152,23 +153,39 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/* allocate and add a new ordered_extent into the per-inode tree.
+/*
+ * Allocate and add a new ordered_extent into the per-inode tree.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type, int dio,
int compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
+ int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
+ if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ /* For nocow write, we can release the qgroup rsv right now */
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ } else {
+ /*
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ }
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
@@ -178,9 +195,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->num_bytes = num_bytes;
entry->disk_num_bytes = disk_num_bytes;
entry->bytes_left = num_bytes;
- entry->inode = igrab(inode);
+ entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+ entry->qgroup_rsv = ret;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -197,10 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- INIT_LIST_HEAD(&entry->log_list);
- INIT_LIST_HEAD(&entry->trans_list);
- trace_btrfs_ordered_extent_add(inode, entry);
+ trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -228,14 +244,14 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
return 0;
}
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
@@ -244,7 +260,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
@@ -253,7 +269,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type)
@@ -291,12 +307,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
* file_offset is updated to one byte past the range that is recorded as
* complete. This allows you to walk forward in the file.
*/
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size, int uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
@@ -305,7 +321,6 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
u64 dec_start;
u64 to_dec;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
@@ -429,8 +444,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
- ASSERT(list_empty(&entry->log_list));
- ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
@@ -698,14 +711,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,7 +816,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(inode, offset);
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
if (!ordered)
return 0;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c01c9698250b..d61ea9c880a3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -92,6 +92,9 @@ struct btrfs_ordered_extent {
/* compression algorithm */
int compress_type;
+ /* Qgroup reserved space */
+ int qgroup_rsv;
+
/* reference count */
refcount_t refs;
@@ -101,12 +104,6 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
- /* If we need to wait on this to be done */
- struct list_head log_list;
-
- /* If the transaction needs to wait on this ordered extent */
- struct list_head trans_list;
-
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -150,23 +147,23 @@ void btrfs_remove_ordered_extent(struct inode *inode,
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size,
int uptodate);
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5bd4089ad0e1..c0f350c3a0cf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
-#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
@@ -22,6 +21,7 @@
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
+#include "sysfs.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -220,10 +220,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
@@ -252,7 +254,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
return 0;
}
@@ -351,6 +353,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
@@ -412,6 +417,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
@@ -500,12 +509,51 @@ out:
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
+/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
* quota_root to be null with qgroup_lock held before, so it is safe to clean
@@ -519,7 +567,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -528,6 +576,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -900,6 +949,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -987,6 +1039,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
@@ -1011,6 +1068,11 @@ out_add_root:
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
ret = btrfs_commit_transaction(trans);
trans = NULL;
@@ -1046,6 +1108,7 @@ out:
fs_info->qgroup_ulist = NULL;
if (trans)
btrfs_end_transaction(trans);
+ btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1398,8 +1461,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
- if (IS_ERR(qgroup))
+ if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -2818,6 +2884,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
unlock:
spin_unlock(&fs_info->qgroup_lock);
+ if (!ret)
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2826,20 +2894,8 @@ out:
return ret;
}
-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO 32
-#define QGROUP_FREE_SIZE SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 free;
- u64 threshold;
-
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2848,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
- /*
- * Even if we passed the check, it's better to check if reservation
- * for meta_pertrans is pushing us near limit.
- * If there is too much pertrans reservation or it's near the limit,
- * let's try commit transaction to free some, using transaction_kthread
- */
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- } else {
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- }
-
- /*
- * Use transaction_kthread to commit transaction, so we no
- * longer need to bother nested transaction nor lock context.
- */
- if (free < threshold)
- btrfs_commit_transaction_locksafe(fs_info);
- }
-
return true;
}
@@ -2921,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -3378,28 +3408,132 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
}
}
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry = NULL;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else if (entry)
+ node = node->rb_left;
+ else
+ break;
+ }
+
+ /* Empty changeset */
+ if (!entry)
+ return 0;
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
+}
+
/*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
*
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
*
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
*
- * NOTE: this function may sleep for memory allocation.
- * if btrfs_qgroup_reserve_data() is called multiple times with
- * same @reserved, caller must ensure when error happens it's OK
- * to free *ALL* reserved space.
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
+ bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
@@ -3412,6 +3546,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
+ new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
@@ -3419,15 +3554,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
- trace_btrfs_qgroup_reserve_data(inode, start, len,
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
- goto cleanup;
+ goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -3435,23 +3570,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
- /* cleanup *ALL* already reserved ranges */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
- /* Also free data bytes of already reserved one */
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
- extent_changeset_release(reserved);
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_release(reserved);
+ kfree(reserved);
+ *reserved_ret = NULL;
+ }
return ret;
}
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
/* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
@@ -3487,8 +3648,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
- free_start, free_start + free_len - 1,
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -3502,7 +3663,7 @@ out:
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode,
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
@@ -3510,8 +3671,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
- &BTRFS_I(inode)->root->fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
@@ -3519,18 +3679,18 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
- trace_btrfs_qgroup_release_data(inode, start, len,
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3550,7 +3710,7 @@ out:
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode,
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
@@ -3571,7 +3731,7 @@ int btrfs_qgroup_free_data(struct inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
@@ -3616,7 +3776,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3643,6 +3803,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
+{
+ int ret;
+
+ ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3742,7 +3917,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
@@ -3750,19 +3925,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
- inode->i_ino, unode->val, unode->aux);
+ btrfs_warn(inode->root->fs_info,
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
+ btrfs_ino(inode), unode->val, unode->aux);
}
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1bc654459469..50dea9a2d8fb 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -223,8 +224,18 @@ struct btrfs_qgroup {
*/
u64 old_refcnt;
u64 new_refcnt;
+
+ /*
+ * Sysfs kobjectid
+ */
+ struct kobject kobj;
};
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
/*
* For qgroup event trace points only
*/
@@ -344,12 +355,12 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
-
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
@@ -399,7 +410,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
@@ -415,5 +426,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c870ef70f817..255490f42b5d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1083,7 +1083,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
- u64 last_end = 0;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
@@ -1098,15 +1097,14 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = (u64)last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
- if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_status &&
+ if (last_end == disk_start && !last->bi_status &&
last->bi_disk == stripe->dev->bdev->bd_disk &&
last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
@@ -1117,6 +1115,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1325,11 +1324,7 @@ write_data:
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -1354,7 +1349,6 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
- u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
@@ -1362,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
- stripe_start = stripe->physical;
- if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len &&
+ if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
@@ -1382,18 +1374,14 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_iter.bi_sector;
- u64 stripe_start;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
int i;
- logical <<= 9;
-
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->bbio->raid_map[i];
- if (logical >= stripe_start &&
- logical < stripe_start + rbio->stripe_len) {
+ u64 stripe_start = rbio->bbio->raid_map[i];
+
+ if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
- }
}
return -1;
}
@@ -1567,11 +1555,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -1878,11 +1862,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
+ if (faila > failb)
+ swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
@@ -2102,7 +2083,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
- goto out;
+ return 0;
} else {
goto cleanup;
}
@@ -2113,11 +2094,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -2126,7 +2103,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
submit_bio(bio);
}
-out:
+
return 0;
cleanup:
@@ -2482,11 +2459,7 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -2664,11 +2637,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index af92525dbb16..7f03dbe5b609 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
+ } else {
+ kfree(re);
}
kfree(be);
return exist;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 040009d1cc31..5cd02514cf4d 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -68,8 +68,8 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
- block_size);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ file_offset, block_size);
if (ret)
goto out;
@@ -84,7 +84,8 @@ static int copy_inline_to_page(struct inode *inode,
clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
+ 0, NULL);
if (ret)
goto out_unlock;
@@ -133,8 +134,8 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, file_offset,
- block_size, true);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ file_offset, block_size, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
out:
extent_changeset_free(data_reserved);
@@ -336,6 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
int type;
u32 size;
struct btrfs_key new_key;
@@ -384,6 +386,7 @@ process_slot:
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
@@ -488,6 +491,19 @@ process_slot:
btrfs_release_path(path);
+ /*
+ * If this is a new extent update the last_reflink_trans of both
+ * inodes. This is used by fsync to make sure it does not log
+ * multiple checksum items with overlapping ranges. For older
+ * extents we don't need to do it since inode logging skips the
+ * checksums for older extents. Also ignore holes and inline
+ * extents because they don't have checksums in the csum tree.
+ */
+ if (extent_gen == trans->transid && disko > 0) {
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+ }
+
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3bbae80c752f..4ba1ab9cc76d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ /*
+ * In merge_reloc_root(), we modify the upper level pointer to swap the
+ * tree blocks between reloc tree and subvolume tree. Thus for tree
+ * block COW, we COW at most from level 1 to root level for each tree.
+ *
+ * Thus the needed metadata size is at most root_level * nodesize,
+ * and * 2 since we have two trees to COW.
+ */
+ min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
err = ret;
goto out;
@@ -2571,58 +2579,50 @@ out_free_blocks:
return err;
}
-static noinline_for_stack
-int prealloc_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(
+ struct btrfs_inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = inode->index_cnt;
u64 num_bytes;
- int nr = 0;
+ int nr;
int ret = 0;
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
- u64 cur_offset;
- struct extent_changeset *data_reserved = NULL;
+ u64 cur_offset = prealloc_start;
BUG_ON(cluster->start != cluster->boundary[0]);
- inode_lock(inode);
-
- ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
- prealloc_end + 1 - prealloc_start);
+ ret = btrfs_alloc_data_chunk_ondemand(inode,
+ prealloc_end + 1 - prealloc_start);
if (ret)
- goto out;
+ return ret;
- cur_offset = prealloc_start;
- while (nr < cluster->nr) {
+ inode_lock(&inode->vfs_inode);
+ for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end);
num_bytes = end + 1 - start;
- if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, start - cur_offset);
- ret = btrfs_prealloc_file_range(inode, 0, start,
+ ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
if (ret)
break;
- nr++;
}
+ inode_unlock(&inode->vfs_inode);
+
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, prealloc_end + 1 - cur_offset);
-out:
- inode_unlock(inode);
- extent_changeset_free(data_reserved);
+ btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
+ prealloc_end + 1 - cur_offset);
return ret;
}
@@ -2664,7 +2664,8 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
*/
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
- return atomic_read(&fs_info->balance_cancel_req);
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
@@ -2690,7 +2691,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(inode, cluster);
+ ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
@@ -2762,8 +2763,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
nr++;
}
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
+ page_end, 0, NULL);
if (ret) {
unlock_page(page);
put_page(page);
@@ -3872,9 +3873,9 @@ out:
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -3885,7 +3886,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ disk_bytenr = file_pos + inode->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 016a025e36c7..5a6cb9db512e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1616,13 +1616,9 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
struct scrub_page *spage = sblock->pagev[page_num];
BUG_ON(spage->page == NULL);
- if (spage->io_error) {
- void *mapped_buffer = kmap_atomic(spage->page);
+ if (spage->io_error)
+ clear_page(page_address(spage->page));
- clear_page(mapped_buffer);
- flush_dcache_page(spage->page);
- kunmap_atomic(mapped_buffer);
- }
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
@@ -1790,42 +1786,21 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- u8 *on_disk_csum;
- struct page *page;
- void *buffer;
- u64 len;
- int index;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0]->have_csum)
+ spage = sblock->pagev[0];
+ if (!spage->have_csum)
return 0;
+ kaddr = page_address(spage->page);
+
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- on_disk_csum = sblock->pagev[0]->csum;
- page = sblock->pagev[0]->page;
- buffer = kmap_atomic(page);
-
- len = sctx->fs_info->sectorsize;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, PAGE_SIZE);
-
- crypto_shash_update(shash, buffer, l);
- kunmap_atomic(buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- buffer = kmap_atomic(page);
- }
-
- crypto_shash_final(shash, csum);
- if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(csum, spage->csum, sctx->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
@@ -1839,20 +1814,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+ int i;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- h = (struct btrfs_header *)mapped_buffer;
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->csum_size);
/*
@@ -1860,40 +1830,29 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ if (spage->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ if (spage->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(h->fsid, spage))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(sblock->pagev[i]->page);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
crypto_shash_final(shash, calculated_csum);
@@ -1910,57 +1869,31 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
+ struct scrub_page *spage;
+ char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- s = (struct btrfs_super_block *)mapped_buffer;
- memcpy(on_disk_csum, s->csum, sctx->csum_size);
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ s = (struct btrfs_super_block *)kaddr;
- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+ if (spage->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+ if (spage->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(s->fsid, spage))
++fail_cor;
- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
-
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
- }
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -1973,10 +1906,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -3758,7 +3691,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- return -EIO;
+ return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index c7bd3fdd7792..475968ccbd1d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -468,8 +468,8 @@ again:
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
- btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
+ btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c3826ae883f0..5a9dc31d95c9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -67,6 +67,21 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -326,7 +341,6 @@ enum {
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_discard_mode,
- Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
@@ -340,13 +354,15 @@ enum {
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
- Opt_usebackuproot,
Opt_user_subvol_rm_allowed,
+ /* Rescue options */
+ Opt_rescue,
+ Opt_usebackuproot,
+ Opt_nologreplay,
+
/* Deprecated options */
- Opt_alloc_start,
Opt_recovery,
- Opt_subvolrootid,
/* Debugging options */
Opt_check_integrity,
@@ -390,7 +406,6 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
- {Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
@@ -408,13 +423,17 @@ static const match_table_t tokens = {
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
- {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ /* Rescue options */
+ {Opt_rescue, "rescue=%s"},
+ /* Deprecated, with alias rescue=nologreplay */
+ {Opt_nologreplay, "nologreplay"},
+ /* Deprecated, with alias rescue=usebackuproot */
+ {Opt_usebackuproot, "usebackuproot"},
+
/* Deprecated options */
- {Opt_alloc_start, "alloc_start=%s"},
{Opt_recovery, "recovery"},
- {Opt_subvolrootid, "subvolrootid=%d"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
@@ -433,6 +452,55 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+static const match_table_t rescue_tokens = {
+ {Opt_usebackuproot, "usebackuproot"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_err, NULL},
+};
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+ char *opts;
+ char *orig;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int ret = 0;
+
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ":")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, rescue_tokens, args);
+ switch (token){
+ case Opt_usebackuproot:
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_nologreplay:
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_err:
+ btrfs_info(info, "unrecognized rescue option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+
+ }
+out:
+ kfree(orig);
+ return ret;
+}
+
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -479,7 +547,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
- case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
@@ -663,10 +730,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
}
break;
- case Opt_alloc_start:
- btrfs_info(info,
- "option alloc_start is obsolete, ignored");
- break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
@@ -689,6 +752,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_norecovery:
case Opt_nologreplay:
+ btrfs_warn(info,
+ "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
@@ -762,6 +827,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_inode_cache:
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and will have no effect from 5.11");
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
@@ -791,10 +858,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_warn(info,
- "'recovery' is deprecated, use 'usebackuproot' instead");
- fallthrough;
case Opt_usebackuproot:
+ btrfs_warn(info,
+ "'%s' is deprecated, use 'rescue=usebackuproot' instead",
+ token == Opt_recovery ? "recovery" :
+ "usebackuproot");
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
@@ -859,6 +927,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
info->commit_interval = intarg;
break;
+ case Opt_rescue:
+ ret = parse_rescue_options(info, args[0].from);
+ if (ret < 0)
+ goto out;
+ break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -1020,9 +1093,6 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
*subvol_objectid = subvolid;
break;
- case Opt_subvolrootid:
- pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
- break;
default:
break;
}
@@ -1344,7 +1414,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",nologreplay");
+ seq_puts(seq, ",rescue=nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1712,11 +1782,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
new_pool_size);
}
-static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-}
-
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
unsigned long old_opts, int flags)
{
@@ -1750,8 +1815,6 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
-
- clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1767,7 +1830,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
int ret;
sync_filesystem(sb);
- btrfs_remount_prepare(fs_info);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
if (data) {
void *new_sec_opts = NULL;
@@ -1889,6 +1952,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
out:
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return 0;
restore:
@@ -1903,6 +1968,8 @@ restore:
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return ret;
}
@@ -2296,9 +2363,7 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_fs_devices *cur_devices;
struct btrfs_device *dev, *first_dev = NULL;
- struct list_head *head;
/*
* Lightweight locking of the devices. We should not need
@@ -2308,18 +2373,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* least until the rcu_read_unlock.
*/
rcu_read_lock();
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
- head = &cur_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
- cur_devices = cur_devices->seed;
+ list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
}
if (first_dev)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a39bff64ff24..104c80caaa74 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -19,6 +19,7 @@
#include "volumes.h"
#include "space-info.h"
#include "block-group.h"
+#include "qgroup.h"
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
@@ -936,8 +937,12 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+
btrfs_reset_fs_info_ptr(fs_info);
+ sysfs_remove_link(fsid_kobj, "bdi");
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
@@ -957,8 +962,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
}
#endif
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(fsid_kobj, btrfs_attrs);
btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
}
@@ -1273,7 +1278,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
{
int error = 0;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ nofs_flag = memalloc_nofs_save();
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (one_device && one_device != dev)
@@ -1301,6 +1308,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
break;
}
}
+ memalloc_nofs_restore(nofs_flag);
return error;
}
@@ -1438,6 +1446,10 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
+ error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (error)
+ goto failure;
+
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
@@ -1455,6 +1467,153 @@ failure:
return error;
}
+static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
+{
+ return to_fs_info(kobj->parent->parent);
+}
+
+#define QGROUP_ATTR(_member, _show_name) \
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type) \
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->rsv.values[_type], \
+ &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer, referenced);
+QGROUP_ATTR(excl, exclusive);
+QGROUP_ATTR(max_rfer, max_referenced);
+QGROUP_ATTR(max_excl, max_exclusive);
+QGROUP_ATTR(lim_flags, limit_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+static struct attribute *qgroup_attrs[] = {
+ BTRFS_ATTR_PTR(qgroup, referenced),
+ BTRFS_ATTR_PTR(qgroup, exclusive),
+ BTRFS_ATTR_PTR(qgroup, max_referenced),
+ BTRFS_ATTR_PTR(qgroup, max_exclusive),
+ BTRFS_ATTR_PTR(qgroup, limit_flags),
+ BTRFS_ATTR_PTR(qgroup, rsv_data),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+
+static void qgroup_release(struct kobject *kobj)
+{
+ struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj);
+
+ memset(&qgroup->kobj, 0, sizeof(*kobj));
+}
+
+static struct kobj_type qgroup_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = qgroup_release,
+ .default_groups = qgroup_groups,
+};
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+ if (qgroup->kobj.state_initialized)
+ return 0;
+ if (!qgroups_kobj)
+ return -EINVAL;
+
+ ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+ "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid));
+ if (ret < 0)
+ kobject_put(&qgroup->kobj);
+
+ return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node)
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+}
+
+/* Called when qgroups get initialized, thus there is no need for locking */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+ int ret = 0;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+
+ ASSERT(fsid_kobj);
+ if (fs_info->qgroups_kobj)
+ return 0;
+
+ fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
+ if (!fs_info->qgroups_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node) {
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ if (ret < 0)
+ btrfs_sysfs_del_qgroups(fs_info);
+ return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ if (qgroup->kobj.state_initialized) {
+ kobject_del(&qgroup->kobj);
+ kobject_put(&qgroup->kobj);
+ }
+}
/*
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 718a26c97833..cf839c46a131 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -36,4 +36,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+
#endif
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 914eea5ba6a7..2c783d2f5228 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
- if (i >= num_extents)
- goto invalid;
if (i >= num_extents ||
extent_start != extents[i].start ||
offset - extent_start != extents[i].length)
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 24a8c714f56c..894a63a92236 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -954,8 +954,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
+ BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL);
if (ret) {
@@ -999,7 +999,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
0, NULL);
@@ -1017,7 +1017,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL);
@@ -1035,7 +1035,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1069,7 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b359d4b17658..20c6ac1a5de7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
- err = -EIO;
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -1630,7 +1633,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_fs_root(fs_info, objectid, true);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
@@ -2351,7 +2354,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index bf102e64bfb2..d60b055b8695 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -151,18 +151,20 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
+ /* Preallocated anonymous block device number */
+ dev_t anon_dev;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_trans = trans->transaction->transid;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_trans = trans->transaction->transid;
+ inode->last_sub_trans = inode->root->log_transid;
+ inode->last_log_commit = inode->root->last_log_commit;
+ spin_unlock(&inode->lock);
}
/*
@@ -208,20 +210,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
-
-/*
- * Try to commit transaction asynchronously, so this is safe to call
- * even holding a spinlock.
- *
- * It's done by informing transaction_kthread to commit transaction without
- * waiting for commit interval.
- */
-static inline void btrfs_commit_transaction_locksafe(
- struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
- wake_up_process(fs_info->transaction_kthread);
-}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 16c3a6d2586d..d3f28b8f4ff9 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -133,10 +133,9 @@ out:
ret = 0;
}
done:
- if (ret != -EAGAIN) {
+ if (ret != -EAGAIN)
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
- root->defrag_trans_start = trans->transid;
- }
+
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cd5348f352dd..ea8136dcf71f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3116,29 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
@@ -3184,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3906,6 +3892,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
}
static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
@@ -3914,6 +3901,14 @@ static int log_csums(struct btrfs_trans_handle *trans,
int ret;
/*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
* Serialize logging for checksums. This is to avoid racing with the
* same checksum being logged by another task that is logging another
* file which happens to refer to the same extent as well. Such races
@@ -4064,7 +4059,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log, sums);
+ ret = log_csums(trans, inode, log, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4123,7 +4118,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log_root, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4151,7 +4146,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
em->start + em->len, NULL, 0, 1,
sizeof(*fi), &extent_inserted);
if (ret)
@@ -5123,14 +5118,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
int err = 0;
- int ret;
+ int ret = 0;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5166,15 +5160,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.offset = (u64)-1;
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * Only run delayed items if we are a directory. We want to make sure
+ * all directory indexes hit the fs/subvolume tree so we can find them
+ * and figure out which index ranges have to be logged.
+ *
+ * Otherwise commit the delayed inode only if the full sync flag is set,
+ * as we want to make sure an up to date version is in the subvolume
+ * tree so copy_inode_items_to_log() / copy_items() can find it and copy
+ * it to the log tree. For a non full sync, we always log the inode item
+ * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (S_ISDIR(inode->vfs_inode.i_mode))
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
+ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
ret = btrfs_commit_inode_delayed_inode(inode);
if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..d7670e2a9f39 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*
* global::fs_devs - add, remove, updates to the global list
*
- * does not protect: manipulation of the fs_devices::devices list!
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by eg.
+ * scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
@@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(const char *path,
return ret;
}
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
@@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int ret;
lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here in case opening the
+ * underlying device takes further locks like bd_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex
+ */
- mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
@@ -3231,7 +3246,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -4135,7 +4150,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
mutex_lock(&fs_info->balance_mutex);
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
btrfs_info(fs_info, "balance: paused");
- else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ /*
+ * Balance can be canceled by:
+ *
+ * - Regular cancel request
+ * Then ret == -ECANCELED and balance_cancel_req > 0
+ *
+ * - Fatal signal to "btrfs" process
+ * Either the signal caught by wait_reserve_ticket() and callers
+ * got -EINTR, or caught by btrfs_should_cancel_balance() and
+ * got -ECANCELED.
+ * Either way, in this case balance_cancel_req = 0, and
+ * ret == -EINTR or ret == -ECANCELED.
+ *
+ * So here we only check the return value to catch canceled balance.
+ */
+ else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
@@ -5522,6 +5552,9 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
atomic_set(&bbio->error, 0);
refcount_set(&bbio->refs, 1);
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+
return bbio;
}
@@ -6144,8 +6177,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripe_index++;
+ }
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
@@ -6153,11 +6191,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 tmp;
unsigned rot;
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
- sizeof(struct btrfs_bio_stripe) *
- num_alloc_stripes +
- sizeof(int) * tgtdev_indexes);
-
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
@@ -6171,25 +6204,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
+ sort_parity_stripes(bbio, num_stripes);
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
- if (bbio->raid_map)
- sort_parity_stripes(bbio, num_stripes);
-
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
@@ -6261,23 +6282,18 @@ static void btrfs_end_bio(struct bio *bio)
atomic_inc(&bbio->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- unsigned int stripe_index =
- btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
-
- BUG_ON(stripe_index >= bbio->num_stripes);
- dev = bbio->stripes[stripe_index].dev;
- if (dev->bdev) {
- if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+
+ ASSERT(dev->bdev);
+ if (bio_op(bio) == REQ_OP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ else if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- }
}
}
@@ -6313,13 +6329,12 @@ static void btrfs_end_bio(struct bio *bio)
}
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr)
+ u64 physical, struct btrfs_device *dev)
{
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
- btrfs_io_bio(bio)->stripe_index = dev_nr;
+ btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
@@ -6420,8 +6435,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr);
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
@@ -7029,6 +7043,19 @@ out:
return ret;
}
+static void readahead_tree_node_children(struct extent_buffer *node)
+{
+ int i;
+ const int nr_items = btrfs_header_nritems(node);
+
+ for (i = 0; i < nr_items; i++) {
+ u64 start;
+
+ start = btrfs_node_blockptr(node, i);
+ readahead_tree_block(node->fs_info, start);
+ }
+}
+
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -7039,6 +7066,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
int ret;
int slot;
u64 total_dev = 0;
+ u64 last_ra_node = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7049,7 +7077,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
- mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
/*
* Read all device items, and then all the chunk items. All
@@ -7064,6 +7099,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret < 0)
goto error;
while (1) {
+ struct extent_buffer *node;
+
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
@@ -7074,6 +7111,17 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
goto error;
break;
}
+ /*
+ * The nodes on level 1 are not locked but we don't need to do
+ * that during mount time as nothing else can access the tree
+ */
+ node = path->nodes[1];
+ if (node) {
+ if (last_ra_node != node->start) {
+ readahead_tree_node_children(node);
+ last_ra_node = node->start;
+ }
+ }
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
@@ -7086,7 +7134,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
+ mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
@@ -7116,7 +7166,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
}
ret = 0;
error:
- mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 75af2334b2e3..5eea93916fbf 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -288,7 +288,7 @@ struct btrfs_fs_devices {
*/
struct btrfs_io_bio {
unsigned int mirror_num;
- unsigned int stripe_index;
+ struct btrfs_device *device;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
diff --git a/fs/buffer.c b/fs/buffer.c
index 64fe82ec65ff..061dd202979d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -320,9 +320,8 @@ static void decrypt_bh(struct work_struct *work)
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
/* Decrypt if needed */
- if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
- IS_ENCRYPTED(bh->b_page->mapping->host) &&
- S_ISREG(bh->b_page->mapping->host->i_mode)) {
+ if (uptodate &&
+ fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx) {
@@ -3040,12 +3039,10 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- /*
- * from here on down, it's all bio -- do the initial mapping,
- * submit_bio -> generic_make_request may further map this bio around
- */
bio = bio_alloc(GFP_NOIO, 1);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 49c3ea8aa845..ce95801e9b66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2044,7 +2044,6 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
FILE_UNIX_BASIC_INFO *info_buf_target;
unsigned int xid;
int rc, tmprc;
- bool new_target = d_really_is_negative(target_dentry);
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -2121,13 +2120,8 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
*/
unlink_target:
- /*
- * If the target dentry was created during the rename, try
- * unlinking it if it's not negative
- */
- if (new_target &&
- d_really_is_positive(target_dentry) &&
- (rc == -EACCES || rc == -EEXIST)) {
+ /* Try unlinking the target dentry if it's not negative */
+ if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
if (d_is_dir(target_dentry))
tmprc = cifs_rmdir(target_dir, target_dentry);
else
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 8046d7c7a3e9..a5f5c30368a2 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -4,6 +4,7 @@ config FS_ENCRYPTION
select CRYPTO
select CRYPTO_HASH
select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_SHA256
select KEYS
help
Enable encryption of files and directories. This
@@ -21,6 +22,11 @@ config FS_ENCRYPTION_ALGS
select CRYPTO_CTS
select CRYPTO_ECB
select CRYPTO_HMAC
- select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_XTS
+
+config FS_ENCRYPTION_INLINE_CRYPT
+ bool "Enable fscrypt to use inline crypto"
+ depends on FS_ENCRYPTION && BLK_INLINE_ENCRYPTION
+ help
+ Enable fscrypt to use inline encryption hardware if available.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 232e2bb5a337..652c7180ec6d 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -11,3 +11,4 @@ fscrypto-y := crypto.o \
policy.o
fscrypto-$(CONFIG_BLOCK) += bio.o
+fscrypto-$(CONFIG_FS_ENCRYPTION_INLINE_CRYPT) += inline_crypt.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 4fa18fff9c4e..b048a0e38516 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -41,6 +41,53 @@ void fscrypt_decrypt_bio(struct bio *bio)
}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
+static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
+ pgoff_t lblk, sector_t pblk,
+ unsigned int len)
+{
+ const unsigned int blockbits = inode->i_blkbits;
+ const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits);
+ struct bio *bio;
+ int ret, err = 0;
+ int num_pages = 0;
+
+ /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
+ bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+
+ while (len) {
+ unsigned int blocks_this_page = min(len, blocks_per_page);
+ unsigned int bytes_this_page = blocks_this_page << blockbits;
+
+ if (num_pages == 0) {
+ fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
+ bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio->bi_iter.bi_sector =
+ pblk << (blockbits - SECTOR_SHIFT);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ }
+ ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
+ if (WARN_ON(ret != bytes_this_page)) {
+ err = -EIO;
+ goto out;
+ }
+ num_pages++;
+ len -= blocks_this_page;
+ lblk += blocks_this_page;
+ pblk += blocks_this_page;
+ if (num_pages == BIO_MAX_PAGES || !len ||
+ !fscrypt_mergeable_bio(bio, inode, lblk)) {
+ err = submit_bio_wait(bio);
+ if (err)
+ goto out;
+ bio_reset(bio);
+ num_pages = 0;
+ }
+ }
+out:
+ bio_put(bio);
+ return err;
+}
+
/**
* fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
* @inode: the file's inode
@@ -75,6 +122,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
if (len == 0)
return 0;
+ if (fscrypt_inode_uses_inline_crypto(inode))
+ return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk,
+ len);
+
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES);
nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
(len + blocks_per_page - 1) >> blocks_per_page_bits);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ed015cb66c7c..9212325763b0 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -84,7 +84,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
WARN_ON_ONCE(lblk_num > U32_MAX);
lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
- memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
}
iv->lblk_num = cpu_to_le64(lblk_num);
}
@@ -100,7 +100,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
if (WARN_ON_ONCE(len <= 0))
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 83ca5f1e7934..011830f84d8d 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,30 +61,13 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static struct crypto_shash *sha256_hash_tfm;
-
-static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
+static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
{
- struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm);
-
- if (unlikely(!tfm)) {
- struct crypto_shash *prev_tfm;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- fscrypt_err(NULL,
- "Error allocating SHA-256 transform: %ld",
- PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
- prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm);
- if (prev_tfm) {
- crypto_free_shash(tfm);
- tfm = prev_tfm;
- }
- }
+ struct sha256_state sctx;
- return crypto_shash_tfm_digest(tfm, data, data_len, result);
+ sha256_init(&sctx);
+ sha256_update(&sctx, data, data_len);
+ sha256_final(&sctx, result);
}
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
@@ -115,7 +98,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
const struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
int res;
@@ -171,7 +154,7 @@ static int fname_decrypt(const struct inode *inode,
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
const struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -349,7 +332,6 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
const struct qstr qname = FSTR_TO_QSTR(iname);
struct fscrypt_nokey_name nokey_name;
u32 size; /* size of the unencoded no-key name */
- int err;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -387,11 +369,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
- if (err)
- return err;
+ fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -530,9 +510,8 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256))
- return false;
+ fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), sha256);
return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index eb7fcd2b7fb8..8117a61b6f55 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -14,12 +14,13 @@
#include <linux/fscrypt.h>
#include <linux/siphash.h>
#include <crypto/hash.h>
+#include <linux/blk-crypto.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
-#define FS_KEY_DERIVATION_NONCE_SIZE 16
+#define FSCRYPT_FILE_NONCE_SIZE 16
-#define FSCRYPT_MIN_KEY_SIZE 16
+#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
@@ -30,7 +31,7 @@ struct fscrypt_context_v1 {
u8 filenames_encryption_mode;
u8 flags;
u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
struct fscrypt_context_v2 {
@@ -40,7 +41,7 @@ struct fscrypt_context_v2 {
u8 flags;
u8 __reserved[4];
u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
/*
@@ -166,6 +167,20 @@ struct fscrypt_symlink_data {
char encrypted_path[1];
} __packed;
+/**
+ * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption
+ * @tfm: crypto API transform object
+ * @blk_key: key for blk-crypto
+ *
+ * Normally only one of the fields will be non-NULL.
+ */
+struct fscrypt_prepared_key {
+ struct crypto_skcipher *tfm;
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ struct fscrypt_blk_crypto_key *blk_key;
+#endif
+};
+
/*
* fscrypt_info - the "encryption key" for an inode
*
@@ -175,12 +190,20 @@ struct fscrypt_symlink_data {
*/
struct fscrypt_info {
- /* The actual crypto transform used for encryption and decryption */
- struct crypto_skcipher *ci_ctfm;
+ /* The key in a form prepared for actual encryption/decryption */
+ struct fscrypt_prepared_key ci_enc_key;
- /* True if the key should be freed when this fscrypt_info is freed */
+ /* True if ci_enc_key should be freed when this fscrypt_info is freed */
bool ci_owns_key;
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ /*
+ * True if this inode will use inline encryption (blk-crypto) instead of
+ * the traditional filesystem-layer encryption.
+ */
+ bool ci_inlinecrypt;
+#endif
+
/*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
@@ -205,7 +228,7 @@ struct fscrypt_info {
/*
* If non-NULL, then encryption is done using the master key directly
- * and ci_ctfm will equal ci_direct_key->dk_ctfm.
+ * and ci_enc_key will equal ci_direct_key->dk_key.
*/
struct fscrypt_direct_key *ci_direct_key;
@@ -221,7 +244,7 @@ struct fscrypt_info {
union fscrypt_policy ci_policy;
/* This inode's nonce, copied from the fscrypt_context */
- u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
/* Hashed inode number. Only set for IV_INO_LBLK_32 */
u32 ci_hashed_ino;
@@ -257,9 +280,10 @@ union fscrypt_iv {
__le64 lblk_num;
/* per-file nonce; only set in DIRECT_KEY mode */
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
u8 raw[FSCRYPT_MAX_IV_SIZE];
+ __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
@@ -288,13 +312,13 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
* outputs are unique and cryptographically isolated, i.e. knowledge of one
* output doesn't reveal another.
*/
-#define HKDF_CONTEXT_KEY_IDENTIFIER 1
-#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2
-#define HKDF_CONTEXT_DIRECT_KEY 3
-#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4
-#define HKDF_CONTEXT_DIRHASH_KEY 5
-#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6
-#define HKDF_CONTEXT_INODE_HASH_KEY 7
+#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */
+#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */
+#define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */
+#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */
+#define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */
+#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */
+#define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */
int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
@@ -302,6 +326,78 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+/* inline_crypt.c */
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+
+static inline bool
+fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+{
+ return ci->ci_inlinecrypt;
+}
+
+int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci);
+
+void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key);
+
+/*
+ * Check whether the crypto transform or blk-crypto key has been allocated in
+ * @prep_key, depending on which encryption implementation the file will use.
+ */
+static inline bool
+fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
+ const struct fscrypt_info *ci)
+{
+ /*
+ * The two smp_load_acquire()'s here pair with the smp_store_release()'s
+ * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key().
+ * I.e., in some cases (namely, if this prep_key is a per-mode
+ * encryption key) another task can publish blk_key or tfm concurrently,
+ * executing a RELEASE barrier. We need to use smp_load_acquire() here
+ * to safely ACQUIRE the memory the other task published.
+ */
+ if (fscrypt_using_inline_encryption(ci))
+ return smp_load_acquire(&prep_key->blk_key) != NULL;
+ return smp_load_acquire(&prep_key->tfm) != NULL;
+}
+
+#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
+
+static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+{
+ return 0;
+}
+
+static inline bool
+fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+{
+ return false;
+}
+
+static inline int
+fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci)
+{
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+
+static inline void
+fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+{
+}
+
+static inline bool
+fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
+ const struct fscrypt_info *ci)
+{
+ return smp_load_acquire(&prep_key->tfm) != NULL;
+}
+#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
+
/* keyring.c */
/*
@@ -395,9 +491,9 @@ struct fscrypt_master_key {
* Per-mode encryption keys for the various types of encryption policies
* that use them. Allocated and derived on-demand.
*/
- struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
- struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
- struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
/* Hash key for inode numbers. Initialized only when needed. */
siphash_key_t mk_ino_hash_key;
@@ -461,13 +557,15 @@ struct fscrypt_mode {
int keysize;
int ivsize;
int logged_impl_name;
+ enum blk_crypto_mode_num blk_crypto_mode;
};
extern struct fscrypt_mode fscrypt_modes[];
-struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
- const u8 *raw_key,
- const struct inode *inode);
+int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key, const struct fscrypt_info *ci);
+
+void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key);
int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
new file mode 100644
index 000000000000..b6b8574caa13
--- /dev/null
+++ b/fs/crypto/inline_crypt.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inline encryption support for fscrypt
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * With "inline encryption", the block layer handles the decryption/encryption
+ * as part of the bio, instead of the filesystem doing the crypto itself via
+ * crypto API. See Documentation/block/inline-encryption.rst. fscrypt still
+ * provides the key and IV to use.
+ */
+
+#include <linux/blk-crypto.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/sched/mm.h>
+
+#include "fscrypt_private.h"
+
+struct fscrypt_blk_crypto_key {
+ struct blk_crypto_key base;
+ int num_devs;
+ struct request_queue *devs[];
+};
+
+static int fscrypt_get_num_devices(struct super_block *sb)
+{
+ if (sb->s_cop->get_num_devices)
+ return sb->s_cop->get_num_devices(sb);
+ return 1;
+}
+
+static void fscrypt_get_devices(struct super_block *sb, int num_devs,
+ struct request_queue **devs)
+{
+ if (num_devs == 1)
+ devs[0] = bdev_get_queue(sb->s_bdev);
+ else
+ sb->s_cop->get_devices(sb, devs);
+}
+
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+{
+ struct super_block *sb = ci->ci_inode->i_sb;
+ unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
+ int ino_bits = 64, lblk_bits = 64;
+
+ if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+ return offsetofend(union fscrypt_iv, nonce);
+
+ if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+ return sizeof(__le64);
+
+ if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
+ return sizeof(__le32);
+
+ /* Default case: IVs are just the file logical block number */
+ if (sb->s_cop->get_ino_and_lblk_bits)
+ sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
+ return DIV_ROUND_UP(lblk_bits, 8);
+}
+
+/* Enable inline encryption for this file if supported. */
+int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+{
+ const struct inode *inode = ci->ci_inode;
+ struct super_block *sb = inode->i_sb;
+ struct blk_crypto_config crypto_cfg;
+ int num_devs;
+ struct request_queue **devs;
+ int i;
+
+ /* The file must need contents encryption, not filenames encryption */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return 0;
+
+ /* The crypto mode must have a blk-crypto counterpart */
+ if (ci->ci_mode->blk_crypto_mode == BLK_ENCRYPTION_MODE_INVALID)
+ return 0;
+
+ /* The filesystem must be mounted with -o inlinecrypt */
+ if (!(sb->s_flags & SB_INLINECRYPT))
+ return 0;
+
+ /*
+ * When a page contains multiple logically contiguous filesystem blocks,
+ * some filesystem code only calls fscrypt_mergeable_bio() for the first
+ * block in the page. This is fine for most of fscrypt's IV generation
+ * strategies, where contiguous blocks imply contiguous IVs. But it
+ * doesn't work with IV_INO_LBLK_32. For now, simply exclude
+ * IV_INO_LBLK_32 with blocksize != PAGE_SIZE from inline encryption.
+ */
+ if ((fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
+ sb->s_blocksize != PAGE_SIZE)
+ return 0;
+
+ /*
+ * On all the filesystem's devices, blk-crypto must support the crypto
+ * configuration that the file would use.
+ */
+ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
+ crypto_cfg.data_unit_size = sb->s_blocksize;
+ crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
+ num_devs = fscrypt_get_num_devices(sb);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ if (!devs)
+ return -ENOMEM;
+ fscrypt_get_devices(sb, num_devs, devs);
+
+ for (i = 0; i < num_devs; i++) {
+ if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
+ goto out_free_devs;
+ }
+
+ ci->ci_inlinecrypt = true;
+out_free_devs:
+ kfree(devs);
+
+ return 0;
+}
+
+int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key,
+ const struct fscrypt_info *ci)
+{
+ const struct inode *inode = ci->ci_inode;
+ struct super_block *sb = inode->i_sb;
+ enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
+ int num_devs = fscrypt_get_num_devices(sb);
+ int queue_refs = 0;
+ struct fscrypt_blk_crypto_key *blk_key;
+ int err;
+ int i;
+ unsigned int flags;
+
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ if (!blk_key)
+ return -ENOMEM;
+
+ blk_key->num_devs = num_devs;
+ fscrypt_get_devices(sb, num_devs, blk_key->devs);
+
+ err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode,
+ fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+ if (err) {
+ fscrypt_err(inode, "error %d initializing blk-crypto key", err);
+ goto fail;
+ }
+
+ /*
+ * We have to start using blk-crypto on all the filesystem's devices.
+ * We also have to save all the request_queue's for later so that the
+ * key can be evicted from them. This is needed because some keys
+ * aren't destroyed until after the filesystem was already unmounted
+ * (namely, the per-mode keys in struct fscrypt_master_key).
+ */
+ for (i = 0; i < num_devs; i++) {
+ if (!blk_get_queue(blk_key->devs[i])) {
+ fscrypt_err(inode, "couldn't get request_queue");
+ err = -EAGAIN;
+ goto fail;
+ }
+ queue_refs++;
+
+ flags = memalloc_nofs_save();
+ err = blk_crypto_start_using_key(&blk_key->base,
+ blk_key->devs[i]);
+ memalloc_nofs_restore(flags);
+ if (err) {
+ fscrypt_err(inode,
+ "error %d starting to use blk-crypto", err);
+ goto fail;
+ }
+ }
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
+ * I.e., here we publish ->blk_key with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it. Note that this concurrency is only
+ * possible for per-mode keys, not for per-file keys.
+ */
+ smp_store_release(&prep_key->blk_key, blk_key);
+ return 0;
+
+fail:
+ for (i = 0; i < queue_refs; i++)
+ blk_put_queue(blk_key->devs[i]);
+ kzfree(blk_key);
+ return err;
+}
+
+void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+{
+ struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key;
+ int i;
+
+ if (blk_key) {
+ for (i = 0; i < blk_key->num_devs; i++) {
+ blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
+ blk_put_queue(blk_key->devs[i]);
+ }
+ kzfree(blk_key);
+ }
+}
+
+bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
+{
+ return inode->i_crypt_info->ci_inlinecrypt;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
+
+static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+ u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
+{
+ union fscrypt_iv iv;
+ int i;
+
+ fscrypt_generate_iv(&iv, lblk_num, ci);
+
+ BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
+ memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
+ for (i = 0; i < ci->ci_mode->ivsize/sizeof(dun[0]); i++)
+ dun[i] = le64_to_cpu(iv.dun[i]);
+}
+
+/**
+ * fscrypt_set_bio_crypt_ctx() - prepare a file contents bio for inline crypto
+ * @bio: a bio which will eventually be submitted to the file
+ * @inode: the file's inode
+ * @first_lblk: the first file logical block number in the I/O
+ * @gfp_mask: memory allocation flags - these must be a waiting mask so that
+ * bio_crypt_set_ctx can't fail.
+ *
+ * If the contents of the file should be encrypted (or decrypted) with inline
+ * encryption, then assign the appropriate encryption context to the bio.
+ *
+ * Normally the bio should be newly allocated (i.e. no pages added yet), as
+ * otherwise fscrypt_mergeable_bio() won't work as intended.
+ *
+ * The encryption context will be freed automatically when the bio is freed.
+ */
+void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
+ u64 first_lblk, gfp_t gfp_mask)
+{
+ const struct fscrypt_info *ci;
+ u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return;
+ ci = inode->i_crypt_info;
+
+ fscrypt_generate_dun(ci, first_lblk, dun);
+ bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
+
+/* Extract the inode and logical block number from a buffer_head. */
+static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
+ const struct inode **inode_ret,
+ u64 *lblk_num_ret)
+{
+ struct page *page = bh->b_page;
+ const struct address_space *mapping;
+ const struct inode *inode;
+
+ /*
+ * The ext4 journal (jbd2) can submit a buffer_head it directly created
+ * for a non-pagecache page. fscrypt doesn't care about these.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ return false;
+ inode = mapping->host;
+
+ *inode_ret = inode;
+ *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) +
+ (bh_offset(bh) >> inode->i_blkbits);
+ return true;
+}
+
+/**
+ * fscrypt_set_bio_crypt_ctx_bh() - prepare a file contents bio for inline
+ * crypto
+ * @bio: a bio which will eventually be submitted to the file
+ * @first_bh: the first buffer_head for which I/O will be submitted
+ * @gfp_mask: memory allocation flags
+ *
+ * Same as fscrypt_set_bio_crypt_ctx(), except this takes a buffer_head instead
+ * of an inode and block number directly.
+ */
+void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
+ const struct buffer_head *first_bh,
+ gfp_t gfp_mask)
+{
+ const struct inode *inode;
+ u64 first_lblk;
+
+ if (bh_get_inode_and_lblk_num(first_bh, &inode, &first_lblk))
+ fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
+
+/**
+ * fscrypt_mergeable_bio() - test whether data can be added to a bio
+ * @bio: the bio being built up
+ * @inode: the inode for the next part of the I/O
+ * @next_lblk: the next file logical block number in the I/O
+ *
+ * When building a bio which may contain data which should undergo inline
+ * encryption (or decryption) via fscrypt, filesystems should call this function
+ * to ensure that the resulting bio contains only contiguous data unit numbers.
+ * This will return false if the next part of the I/O cannot be merged with the
+ * bio because either the encryption key would be different or the encryption
+ * data unit numbers would be discontiguous.
+ *
+ * fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
+ *
+ * Return: true iff the I/O is mergeable
+ */
+bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
+ u64 next_lblk)
+{
+ const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+
+ if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+ if (!bc)
+ return true;
+
+ /*
+ * Comparing the key pointers is good enough, as all I/O for each key
+ * uses the same pointer. I.e., there's currently no need to support
+ * merging requests where the keys are the same but the pointers differ.
+ */
+ if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base)
+ return false;
+
+ fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
+
+/**
+ * fscrypt_mergeable_bio_bh() - test whether data can be added to a bio
+ * @bio: the bio being built up
+ * @next_bh: the next buffer_head for which I/O will be submitted
+ *
+ * Same as fscrypt_mergeable_bio(), except this takes a buffer_head instead of
+ * an inode and block number directly.
+ *
+ * Return: true iff the I/O is mergeable
+ */
+bool fscrypt_mergeable_bio_bh(struct bio *bio,
+ const struct buffer_head *next_bh)
+{
+ const struct inode *inode;
+ u64 next_lblk;
+
+ if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk))
+ return !bio->bi_crypt_context;
+
+ return fscrypt_mergeable_bio(bio, inode, next_lblk);
+}
+EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e24eb48bfbe1..71d56f8e2870 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -45,9 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk)
wipe_master_key_secret(&mk->mk_secret);
for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) {
- crypto_free_skcipher(mk->mk_direct_keys[i]);
- crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]);
- crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
+ fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
}
key_put(mk->mk_users);
@@ -213,7 +213,11 @@ static int allocate_filesystem_keyring(struct super_block *sb)
if (IS_ERR(keyring))
return PTR_ERR(keyring);
- /* Pairs with READ_ONCE() in fscrypt_find_master_key() */
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_find_master_key().
+ * I.e., here we publish ->s_master_keys with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it.
+ */
smp_store_release(&sb->s_master_keys, keyring);
return 0;
}
@@ -234,8 +238,13 @@ struct key *fscrypt_find_master_key(struct super_block *sb,
struct key *keyring;
char description[FSCRYPT_MK_DESCRIPTION_SIZE];
- /* pairs with smp_store_release() in allocate_filesystem_keyring() */
- keyring = READ_ONCE(sb->s_master_keys);
+ /*
+ * Pairs with the smp_store_release() in allocate_filesystem_keyring().
+ * I.e., another task can publish ->s_master_keys concurrently,
+ * executing a RELEASE barrier. We need to use smp_load_acquire() here
+ * to safely ACQUIRE the memory the other task published.
+ */
+ keyring = smp_load_acquire(&sb->s_master_keys);
if (keyring == NULL)
return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 1129adfa097d..fea6226afc2b 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
[FSCRYPT_MODE_AES_256_CTS] = {
.friendly_name = "AES-256-CTS-CBC",
@@ -31,6 +32,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
[FSCRYPT_MODE_AES_128_CTS] = {
.friendly_name = "AES-128-CTS-CBC",
@@ -43,6 +45,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
};
@@ -64,9 +67,9 @@ select_encryption_mode(const union fscrypt_policy *policy,
}
/* Create a symmetric cipher object for the given encryption mode and key */
-struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
- const u8 *raw_key,
- const struct inode *inode)
+static struct crypto_skcipher *
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+ const struct inode *inode)
{
struct crypto_skcipher *tfm;
int err;
@@ -109,30 +112,56 @@ err_free_tfm:
return ERR_PTR(err);
}
-/* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+/*
+ * Prepare the crypto transform object or blk-crypto key in @prep_key, given the
+ * raw key, encryption mode, and flag indicating which encryption implementation
+ * (fs-layer or blk-crypto) will be used.
+ */
+int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
+ const u8 *raw_key, const struct fscrypt_info *ci)
{
struct crypto_skcipher *tfm;
+ if (fscrypt_using_inline_encryption(ci))
+ return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci);
+
tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
+ /*
+ * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
+ * I.e., here we publish ->tfm with a RELEASE barrier so that
+ * concurrent tasks can ACQUIRE it. Note that this concurrency is only
+ * possible for per-mode keys, not for per-file keys.
+ */
+ smp_store_release(&prep_key->tfm, tfm);
+ return 0;
+}
- ci->ci_ctfm = tfm;
+/* Destroy a crypto transform object and/or blk-crypto key. */
+void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
+{
+ crypto_free_skcipher(prep_key->tfm);
+ fscrypt_destroy_inline_crypt_key(prep_key);
+}
+
+/* Given a per-file encryption key, set up the file's crypto transform object */
+int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+{
ci->ci_owns_key = true;
- return 0;
+ return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
}
static int setup_per_mode_enc_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk,
- struct crypto_skcipher **tfms,
+ struct fscrypt_prepared_key *keys,
u8 hkdf_context, bool include_fs_uuid)
{
const struct inode *inode = ci->ci_inode;
const struct super_block *sb = inode->i_sb;
struct fscrypt_mode *mode = ci->ci_mode;
const u8 mode_num = mode - fscrypt_modes;
- struct crypto_skcipher *tfm;
+ struct fscrypt_prepared_key *prep_key;
u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
unsigned int hkdf_infolen = 0;
@@ -141,16 +170,15 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX))
return -EINVAL;
- /* pairs with smp_store_release() below */
- tfm = READ_ONCE(tfms[mode_num]);
- if (likely(tfm != NULL)) {
- ci->ci_ctfm = tfm;
+ prep_key = &keys[mode_num];
+ if (fscrypt_is_key_prepared(prep_key, ci)) {
+ ci->ci_enc_key = *prep_key;
return 0;
}
mutex_lock(&fscrypt_mode_key_setup_mutex);
- if (tfms[mode_num])
+ if (fscrypt_is_key_prepared(prep_key, ci))
goto done_unlock;
BUILD_BUG_ON(sizeof(mode_num) != 1);
@@ -167,16 +195,12 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
mode_key, mode->keysize);
if (err)
goto out_unlock;
- tfm = fscrypt_allocate_skcipher(mode, mode_key, inode);
+ err = fscrypt_prepare_key(prep_key, mode_key, ci);
memzero_explicit(mode_key, mode->keysize);
- if (IS_ERR(tfm)) {
- err = PTR_ERR(tfm);
+ if (err)
goto out_unlock;
- }
- /* pairs with READ_ONCE() above */
- smp_store_release(&tfms[mode_num], tfm);
done_unlock:
- ci->ci_ctfm = tfm;
+ ci->ci_enc_key = *prep_key;
err = 0;
out_unlock:
mutex_unlock(&fscrypt_mode_key_setup_mutex);
@@ -189,7 +213,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
int err;
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
(u8 *)&ci->ci_dirhash_key,
sizeof(ci->ci_dirhash_key));
if (err)
@@ -270,8 +294,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
HKDF_CONTEXT_PER_FILE_ENC_KEY,
- ci->ci_nonce,
- FS_KEY_DERIVATION_NONCE_SIZE,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
derived_key, ci->ci_mode->keysize);
if (err)
return err;
@@ -310,6 +333,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
struct fscrypt_key_specifier mk_spec;
int err;
+ err = fscrypt_select_encryption_impl(ci);
+ if (err)
+ return err;
+
switch (ci->ci_policy.version) {
case FSCRYPT_POLICY_V1:
mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
@@ -402,7 +429,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (ci->ci_direct_key)
fscrypt_put_direct_key(ci->ci_direct_key);
else if (ci->ci_owns_key)
- crypto_free_skcipher(ci->ci_ctfm);
+ fscrypt_destroy_prepared_key(&ci->ci_enc_key);
key = ci->ci_master_key;
if (key) {
@@ -472,7 +499,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
}
memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FS_KEY_DERIVATION_NONCE_SIZE);
+ FSCRYPT_FILE_NONCE_SIZE);
if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
res = -EINVAL;
@@ -491,7 +518,17 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (res)
goto out;
+ /*
+ * Multiple tasks may race to set ->i_crypt_info, so use
+ * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
+ * RELEASE barrier so that other tasks can ACQUIRE it.
+ */
if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ /*
+ * We won the race and set ->i_crypt_info to our crypt_info.
+ * Now link it into the master key's inode list.
+ */
if (master_key) {
struct fscrypt_master_key *mk =
master_key->payload.data[0];
@@ -562,7 +599,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
*/
int fscrypt_drop_inode(struct inode *inode)
{
- const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info);
+ const struct fscrypt_info *ci = fscrypt_get_info(inode);
const struct fscrypt_master_key *mk;
/*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 801b48c0cd7f..e4e707fb1100 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -45,7 +45,7 @@ static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
* key is longer, then only the first 'derived_keysize' bytes are used.
*/
static int derive_key_aes(const u8 *master_key,
- const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
u8 *derived_key, unsigned int derived_keysize)
{
int res = 0;
@@ -68,7 +68,7 @@ static int derive_key_aes(const u8 *master_key,
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
if (res < 0)
goto out;
@@ -146,7 +146,7 @@ struct fscrypt_direct_key {
struct hlist_node dk_node;
refcount_t dk_refcount;
const struct fscrypt_mode *dk_mode;
- struct crypto_skcipher *dk_ctfm;
+ struct fscrypt_prepared_key dk_key;
u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 dk_raw[FSCRYPT_MAX_KEY_SIZE];
};
@@ -154,7 +154,7 @@ struct fscrypt_direct_key {
static void free_direct_key(struct fscrypt_direct_key *dk)
{
if (dk) {
- crypto_free_skcipher(dk->dk_ctfm);
+ fscrypt_destroy_prepared_key(&dk->dk_key);
kzfree(dk);
}
}
@@ -199,6 +199,8 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
continue;
if (ci->ci_mode != dk->dk_mode)
continue;
+ if (!fscrypt_is_key_prepared(&dk->dk_key, ci))
+ continue;
if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
continue;
/* using existing tfm with same (descriptor, mode, raw_key) */
@@ -231,13 +233,9 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
dk->dk_mode = ci->ci_mode;
- dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
- ci->ci_inode);
- if (IS_ERR(dk->dk_ctfm)) {
- err = PTR_ERR(dk->dk_ctfm);
- dk->dk_ctfm = NULL;
+ err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
+ if (err)
goto err_free_dk;
- }
memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
@@ -259,7 +257,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
if (IS_ERR(dk))
return PTR_ERR(dk);
ci->ci_direct_key = dk;
- ci->ci_ctfm = dk->dk_ctfm;
+ ci->ci_enc_key = dk->dk_key;
return 0;
}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index d23ff162c78b..2d73fd39ad96 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -78,6 +78,20 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
int ino_bits = 64, lblk_bits = 64;
/*
+ * IV_INO_LBLK_* exist only because of hardware limitations, and
+ * currently the only known use case for them involves AES-256-XTS.
+ * That's also all we test currently. For these reasons, for now only
+ * allow AES-256-XTS here. This can be relaxed later if a use case for
+ * IV_INO_LBLK_* with other encryption modes arises.
+ */
+ if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) {
+ fscrypt_warn(inode,
+ "Can't use %s policy with contents mode other than AES-256-XTS",
+ type);
+ return false;
+ }
+
+ /*
* It's unsafe to include inode numbers in the IVs if the filesystem can
* potentially renumber inodes, e.g. via filesystem shrinking.
*/
@@ -338,7 +352,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
union fscrypt_context ctx;
int ret;
- ci = READ_ONCE(inode->i_crypt_info);
+ ci = fscrypt_get_info(inode);
if (ci) {
/* key available, use the cached policy */
*policy = ci->ci_policy;
@@ -529,7 +543,7 @@ int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
if (!fscrypt_context_is_valid(&ctx, ret))
return -EINVAL;
if (copy_to_user(arg, fscrypt_context_nonce(&ctx),
- FS_KEY_DERIVATION_NONCE_SIZE))
+ FSCRYPT_FILE_NONCE_SIZE))
return -EFAULT;
return 0;
}
@@ -627,7 +641,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
if (res < 0)
return res;
- ci = READ_ONCE(parent->i_crypt_info);
+ ci = fscrypt_get_info(parent);
if (ci == NULL)
return -ENOKEY;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6d5370eac2a8..183299892465 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1387,8 +1387,8 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* Attempt to prefetch the pieces we likely need later.
*/
prefetch(&bdev->bd_disk->part_tbl);
- prefetch(bdev->bd_queue);
- prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+ prefetch(bdev->bd_disk->queue);
+ prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
end_io, submit_io, flags);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 12c66f5d92dd..28bb5689333a 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -201,6 +201,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
+ if (!efivar_supports_writes())
+ sb->s_flags |= SB_RDONLY;
+
inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
@@ -252,9 +255,6 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
- if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
- return -ENODEV;
-
if (!efivars_kobject())
return -ENODEV;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 4a6ebff2af76..a4a945d0ac6a 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
+#include <linux/blkdev.h>
#include "efs.h"
#include <linux/efs_vh.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10dd470876b3..44bad4bb8831 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1096,7 +1096,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -3737,7 +3737,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index de6fe969f773..defd2e10dfd1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -402,6 +402,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -418,7 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
{
int ret;
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
@@ -506,7 +508,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5761e9961682..f2df2db0786c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -195,7 +195,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
{
unsigned int post_read_steps = 0;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
@@ -230,6 +230,7 @@ int ext4_mpage_readpages(struct inode *inode,
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -258,7 +259,8 @@ int ext4_mpage_readpages(struct inode *inode,
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = next_block =
+ (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
@@ -358,7 +360,8 @@ int ext4_mpage_readpages(struct inode *inode,
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
@@ -370,6 +373,8 @@ int ext4_mpage_readpages(struct inode *inode,
*/
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 330957ed1f05..0907f907c47d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1508,6 +1508,7 @@ enum {
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
@@ -1610,6 +1611,7 @@ static const match_table_t tokens = {
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
@@ -1946,6 +1948,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_nolazytime:
sb->s_flags &= ~SB_LAZYTIME;
return 1;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ ext4_msg(sb, KERN_ERR, "inline encryption not supported");
+#endif
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -2404,6 +2413,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
fscrypt_show_test_dummy_encryption(seq, sep, sb);
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
if (test_opt(sb, DAX_ALWAYS)) {
if (IS_EXT2_SB(sb))
SEQ_OPTS_PUTS("dax");
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 1e02a8c106b0..29e50fbe7eca 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1086,7 +1086,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.submitted = false,
.io_type = io_type,
.io_wbc = wbc,
- .encrypted = f2fs_encrypted_file(cc->inode),
+ .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode),
};
struct dnode_of_data dn;
struct node_info ni;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 326c63879ddc..b9642607c07d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -14,6 +14,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/blk-crypto.h>
#include <linux/swap.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
@@ -459,6 +460,33 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
return bio;
}
+static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
+ pgoff_t first_idx,
+ const struct f2fs_io_info *fio,
+ gfp_t gfp_mask)
+{
+ /*
+ * The f2fs garbage collector sets ->encrypted_page when it wants to
+ * read/write raw data without encryption.
+ */
+ if (!fio || !fio->encrypted_page)
+ fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask);
+}
+
+static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
+ pgoff_t next_idx,
+ const struct f2fs_io_info *fio)
+{
+ /*
+ * The f2fs garbage collector sets ->encrypted_page when it wants to
+ * read/write raw data without encryption.
+ */
+ if (fio && fio->encrypted_page)
+ return !bio_has_crypt_ctx(bio);
+
+ return fscrypt_mergeable_bio(bio, inode, next_idx);
+}
+
static inline void __submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type)
{
@@ -684,6 +712,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
/* Allocate a new bio */
bio = __bio_alloc(fio, 1);
+ f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
+ fio->page->index, fio, GFP_NOIO);
+
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
@@ -763,9 +794,10 @@ static void del_bio_entry(struct bio_entry *be)
kmem_cache_free(bio_entry_slab, be);
}
-static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
+static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct page *page)
{
+ struct f2fs_sb_info *sbi = fio->sbi;
enum temp_type temp;
bool found = false;
int ret = -EAGAIN;
@@ -782,13 +814,19 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
found = true;
- if (bio_add_page(*bio, page, PAGE_SIZE, 0) ==
- PAGE_SIZE) {
+ f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio,
+ *fio->last_block,
+ fio->new_blkaddr));
+ if (f2fs_crypt_mergeable_bio(*bio,
+ fio->page->mapping->host,
+ fio->page->index, fio) &&
+ bio_add_page(*bio, page, PAGE_SIZE, 0) ==
+ PAGE_SIZE) {
ret = 0;
break;
}
- /* bio is full */
+ /* page can't be merged into bio; submit the bio */
del_bio_entry(be);
__submit_bio(sbi, *bio, DATA);
break;
@@ -880,11 +918,13 @@ alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_PAGES);
__attach_io_flag(fio);
+ f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
+ fio->page->index, fio, GFP_NOIO);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
- if (add_ipu_page(fio->sbi, &bio, page))
+ if (add_ipu_page(fio, &bio, page))
goto alloc_new;
}
@@ -936,8 +976,11 @@ next:
inc_page_count(sbi, WB_DATA_TYPE(bio_page));
- if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio,
- io->last_block_in_bio, fio->new_blkaddr))
+ if (io->bio &&
+ (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
+ fio->new_blkaddr) ||
+ !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host,
+ bio_page->index, fio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
@@ -949,6 +992,8 @@ alloc_new:
goto skip;
}
io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
+ f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
+ bio_page->index, fio, GFP_NOIO);
io->fio = *fio;
}
@@ -993,11 +1038,14 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
for_write);
if (!bio)
return ERR_PTR(-ENOMEM);
+
+ f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
+
f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
- if (f2fs_encrypted_file(inode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (f2fs_compressed_file(inode))
post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
@@ -2073,8 +2121,9 @@ zero_out:
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio,
- *last_block_in_bio, block_nr)) {
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ *last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
__submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
@@ -2204,8 +2253,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
- if (bio && !page_is_mergeable(sbi, bio,
- *last_block_in_bio, blkaddr)) {
+ if (bio && (!page_is_mergeable(sbi, bio,
+ *last_block_in_bio, blkaddr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
__submit_bio(sbi, bio, DATA);
bio = NULL;
@@ -2421,6 +2471,9 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
+ if (fscrypt_inode_uses_inline_crypto(inode))
+ return 0;
+
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
PAGE_SIZE, 0, gfp_flags);
@@ -2594,7 +2647,7 @@ got_it:
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
if (err) {
- if (f2fs_encrypted_file(inode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
if (PageWriteback(page))
end_page_writeback(page);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 20e56b0fa46a..23c49c313fb6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -138,6 +138,7 @@ enum {
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_checkpoint_disable,
Opt_checkpoint_disable_cap,
Opt_checkpoint_disable_cap_perc,
@@ -204,6 +205,7 @@ static match_table_t f2fs_tokens = {
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_checkpoint_disable, "checkpoint=disable"},
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
@@ -833,6 +835,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (ret)
return ret;
break;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ f2fs_info(sbi, "inline encryption not supported");
+#endif
+ break;
case Opt_checkpoint_disable_cap_perc:
if (args->from && match_int(args, &arg))
return -EINVAL;
@@ -1590,6 +1599,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
+ if (sbi->sb->s_flags & SB_INLINECRYPT)
+ seq_puts(seq, ",inlinecrypt");
+
if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT)
seq_printf(seq, ",alloc_mode=%s", "default");
else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
@@ -1624,6 +1636,8 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_ext_cnt = 0;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
+ sbi->sb->s_flags &= ~SB_INLINECRYPT;
+
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -2470,6 +2484,25 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
*lblk_bits_ret = 8 * sizeof(block_t);
}
+static int f2fs_get_num_devices(struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (f2fs_is_multi_device(sbi))
+ return sbi->s_ndevs;
+ return 1;
+}
+
+static void f2fs_get_devices(struct super_block *sb,
+ struct request_queue **devs)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ devs[i] = bdev_get_queue(FDEV(i).bdev);
+}
+
static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
@@ -2479,6 +2512,8 @@ static const struct fscrypt_operations f2fs_cryptops = {
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
+ .get_num_devices = f2fs_get_num_devices,
+ .get_devices = f2fs_get_devices,
};
#endif
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2f224b98ee94..f35a37c65e5f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -17,6 +17,7 @@
#include <linux/cred.h>
#include <linux/uio.h>
#include <linux/xattr.h>
+#include <linux/blkdev.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/internal.h b/fs/internal.h
index 9b863a7bd708..969988d3d397 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,7 +23,9 @@ struct user_namespace;
extern void __init bdev_cache_init(void);
extern int __sync_blockdev(struct block_device *bdev, int wait);
-
+void iterate_bdevs(void (*)(struct block_device *, void *), void *);
+void emergency_thaw_bdev(struct super_block *sb);
+void bd_forget(struct inode *inode);
#else
static inline void bdev_cache_init(void)
{
@@ -33,7 +35,18 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
{
return 0;
}
-#endif
+static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
+ void *arg)
+{
+}
+static inline int emergency_thaw_bdev(struct super_block *sb)
+{
+ return 0;
+}
+static inline void bd_forget(struct inode *inode)
+{
+}
+#endif /* CONFIG_BLOCK */
/*
* buffer.c
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 47c5f3aeb460..e92c4724480c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker,
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
}
static void io_assign_current_work(struct io_worker *worker,
@@ -489,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
- unsigned int hash;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -512,6 +512,7 @@ get_next:
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
+ unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
@@ -522,10 +523,8 @@ get_next:
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
- hash = io_get_work_hash(work);
- linked = old_work = work;
- wq->do_work(&linked);
- linked = (old_work == linked) ? NULL : linked;
+ old_work = work;
+ linked = wq->do_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
@@ -542,8 +541,6 @@ get_next:
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- /* dependent work is not hashed */
- hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
@@ -781,8 +778,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(&work);
- work = (work == old_work) ? NULL : work;
+ work = wq->do_work(work);
wq->free_work(old_work);
} while (work);
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..ddaf9614cf9b 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,10 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HASHED = 4,
- IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_NO_CANCEL = 256,
- IO_WQ_WORK_CONCURRENT = 512,
+ IO_WQ_WORK_HASHED = 2,
+ IO_WQ_WORK_UNBOUND = 4,
+ IO_WQ_WORK_NO_CANCEL = 8,
+ IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -89,6 +89,7 @@ struct io_wq_work {
struct mm_struct *mm;
const struct cred *creds;
struct fs_struct *fs;
+ unsigned long fsize;
unsigned flags;
};
@@ -101,7 +102,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
}
typedef void (free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work **);
+typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 74bc4a04befa..2a3af95be4ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
+#include <linux/pagemap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -226,7 +227,7 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int account_mem: 1;
+ unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
@@ -319,12 +320,12 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/*
- * ->poll_list is protected by the ctx->uring_lock for
+ * ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head poll_list;
+ struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
@@ -395,6 +396,7 @@ struct io_timeout {
int flags;
u32 off;
u32 target_seq;
+ struct list_head list;
};
struct io_rw {
@@ -413,7 +415,7 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *msg;
+ struct user_msghdr __user *umsg;
void __user *buf;
};
int msg_flags;
@@ -486,6 +488,12 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_completion {
+ struct file *file;
+ struct list_head list;
+ int cflags;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -503,6 +511,7 @@ struct io_async_rw {
struct iovec *iov;
ssize_t nr_segs;
ssize_t size;
+ struct wait_page_queue wpq;
};
struct io_async_ctx {
@@ -523,23 +532,18 @@ enum {
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_HEAD_BIT,
- REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_MUST_PUNT_BIT,
- REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
- REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
REQ_F_TASK_PINNED_BIT,
@@ -563,8 +567,6 @@ enum {
/* head of a link */
REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
- /* already grabbed next link */
- REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -575,14 +577,8 @@ enum {
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* timeout request */
- REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* must be punted even for NONBLOCK */
- REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
- /* no timeout sequence */
- REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
@@ -595,8 +591,6 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
- /* needs to queue linked timeout */
- REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* req->task is refcounted */
@@ -605,7 +599,7 @@ enum {
struct async_poll {
struct io_poll_iocb poll;
- struct io_wq_work work;
+ struct io_poll_iocb *double_poll;
};
/*
@@ -634,51 +628,54 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ /* use only after cleaning per-op data, see io_clean_op() */
+ struct io_completion compl;
};
struct io_async_ctx *io;
- int cflags;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
u16 buf_index;
+ u32 result;
- struct io_ring_ctx *ctx;
- struct list_head list;
- unsigned int flags;
- refcount_t refs;
- struct task_struct *task;
- unsigned long fsize;
- u64 user_data;
- u32 result;
- u32 sequence;
-
- struct list_head link_list;
+ struct io_ring_ctx *ctx;
+ unsigned int flags;
+ refcount_t refs;
+ struct task_struct *task;
+ u64 user_data;
- struct list_head inflight_entry;
+ struct list_head link_list;
- struct percpu_ref *fixed_file_refs;
+ /*
+ * 1. used with ctx->iopoll_list with reads/writes
+ * 2. to track reqs with ->files (see io_op_def::file_table)
+ */
+ struct list_head inflight_entry;
+
+ struct percpu_ref *fixed_file_refs;
+ struct callback_head task_work;
+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ struct io_wq_work work;
+};
- union {
- /*
- * Only commands that never go async can use the below fields,
- * obviously. Right now only IORING_OP_POLL_ADD uses them, and
- * async armed poll handlers for regular commands. The latter
- * restore the work, if needed.
- */
- struct {
- struct callback_head task_work;
- struct hlist_node hash_node;
- struct async_poll *apoll;
- };
- struct io_wq_work work;
- };
+struct io_defer_entry {
+ struct list_head list;
+ struct io_kiocb *req;
+ u32 seq;
};
-#define IO_PLUG_THRESHOLD 2
#define IO_IOPOLL_BATCH 8
+struct io_comp_state {
+ unsigned int nr;
+ struct list_head list;
+ struct io_ring_ctx *ctx;
+};
+
struct io_submit_state {
struct blk_plug plug;
@@ -689,12 +686,16 @@ struct io_submit_state {
unsigned int free_reqs;
/*
+ * Batch completion logic
+ */
+ struct io_comp_state comp;
+
+ /*
* File reference cache
*/
struct file *file;
unsigned int fd;
unsigned int has_refs;
- unsigned int used_refs;
unsigned int ios_left;
};
@@ -722,6 +723,7 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ unsigned needs_fsize : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -741,6 +743,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -755,6 +758,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -807,6 +811,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .needs_fsize = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
@@ -838,6 +843,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -880,22 +886,37 @@ static const struct io_op_def io_op_defs[] = {
},
};
-static void io_wq_submit_work(struct io_wq_work **workptr);
+enum io_mem_account {
+ ACCT_LOCKED,
+ ACCT_PINNED,
+};
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
-static void io_complete_rw_common(struct kiocb *kiocb, long res);
-static void io_cleanup_req(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe);
+ const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter);
static struct kmem_cache *req_cachep;
@@ -922,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req)
req->flags |= REQ_F_TASK_PINNED;
}
+static inline void io_clean_op(struct io_kiocb *req)
+{
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+ __io_clean_op(req);
+}
+
/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
static void __io_put_req_task(struct io_kiocb *req)
{
@@ -929,7 +956,41 @@ static void __io_put_req_task(struct io_kiocb *req)
put_task_struct(req->task);
}
-static void io_file_put_work(struct work_struct *work);
+static void io_sq_thread_drop_mm(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+ if (!current->mm) {
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+ !mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].needs_mm)
+ return 0;
+ return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
/*
* Note: must call io_req_init_async() for the first time you
@@ -956,6 +1017,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
complete(&ctx->ref_comp);
}
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+ return !req->timeout.off;
+}
+
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
@@ -999,7 +1065,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -1016,18 +1082,14 @@ err:
return NULL;
}
-static inline bool __req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
- struct io_ring_ctx *ctx = req->ctx;
+ if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+ struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail
+ return seq != ctx->cached_cq_tail
+ atomic_read(&ctx->cached_cq_overflow);
-}
-
-static inline bool req_need_defer(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN))
- return __req_need_defer(req);
+ }
return false;
}
@@ -1045,28 +1107,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_req_work_grab_env(struct io_kiocb *req,
- const struct io_op_def *def)
-{
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
- }
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
@@ -1088,11 +1129,12 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
spin_unlock(&req->work.fs->lock);
if (fs)
free_fs_struct(fs);
+ req->work.fs = NULL;
}
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
}
-static inline void io_prep_async_work(struct io_kiocb *req,
- struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1105,18 +1147,42 @@ static inline void io_prep_async_work(struct io_kiocb *req,
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
+ }
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (def->needs_fsize)
+ req->work.fsize = rlimit(RLIMIT_FSIZE);
+ else
+ req->work.fsize = RLIM_INFINITY;
+}
- io_req_work_grab_env(req, def);
+static void io_prep_async_link(struct io_kiocb *req)
+{
+ struct io_kiocb *cur;
- *link = io_prep_linked_timeout(req);
+ io_prep_async_work(req);
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(cur, &req->link_list, link_list)
+ io_prep_async_work(cur);
}
-static inline void io_queue_async_work(struct io_kiocb *req)
+static void __io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
-
- io_prep_async_work(req, &link);
+ struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
@@ -1126,14 +1192,22 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_queue_linked_timeout(link);
}
+static void io_queue_async_work(struct io_kiocb *req)
+{
+ /* init ->work of the whole link before punting */
+ io_prep_async_link(req);
+ __io_queue_async_work(req);
+}
+
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
- atomic_inc(&req->ctx->cq_timeouts);
- list_del_init(&req->list);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+ list_del_init(&req->timeout.list);
req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
@@ -1145,7 +1219,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
io_kill_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -1153,13 +1227,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
- struct io_kiocb *req = list_first_entry(&ctx->defer_list,
- struct io_kiocb, list);
+ struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+ struct io_defer_entry, list);
- if (req_need_defer(req))
+ if (req_need_defer(de->req, de->seq))
break;
- list_del_init(&req->list);
- io_queue_async_work(req);
+ list_del_init(&de->list);
+ /* punt-init is done before queueing for defer */
+ __io_queue_async_work(de->req);
+ kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1167,15 +1243,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->timeout_list)) {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, list);
+ struct io_kiocb, timeout.list);
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(req))
break;
if (req->timeout.target_seq != ctx->cached_cq_tail
- atomic_read(&ctx->cq_timeouts))
break;
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
io_kill_timeout(req);
}
}
@@ -1228,6 +1304,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ }
+}
+
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1258,13 +1343,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
break;
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- list);
- list_move(&req->list, &list);
+ compl.list);
+ list_move(&req->compl.list, &list);
req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->cflags);
+ WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1272,17 +1357,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
- if (cqe) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
- }
+ io_cqring_mark_overflow(ctx);
+
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, list);
- list_del(&req->list);
+ req = list_first_entry(&list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
io_put_req(req);
}
@@ -1315,11 +1397,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
set_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
+ io_clean_op(req);
req->flags |= REQ_F_OVERFLOW;
- refcount_inc(&req->refs);
req->result = res;
- req->cflags = cflags;
- list_add_tail(&req->list, &ctx->cq_overflow_list);
+ req->compl.cflags = cflags;
+ refcount_inc(&req->refs);
+ list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
}
}
@@ -1328,7 +1411,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1341,9 +1424,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
io_cqring_ev_posted(ctx);
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+ struct io_ring_ctx *ctx = cs->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&cs->list)) {
+ struct io_kiocb *req;
+
+ req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
+ __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ if (!(req->flags & REQ_F_LINK_HEAD)) {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ } else {
+ spin_unlock_irq(&ctx->completion_lock);
+ io_put_req(req);
+ spin_lock_irq(&ctx->completion_lock);
+ }
+ }
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+ struct io_comp_state *cs)
+{
+ if (!cs) {
+ io_cqring_add_event(req, res, cflags);
+ io_put_req(req);
+ } else {
+ io_clean_op(req);
+ req->result = res;
+ req->compl.cflags = cflags;
+ list_add_tail(&req->compl.list, &cs->list);
+ if (++cs->nr >= 32)
+ io_submit_flush_completions(cs);
+ }
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
{
- __io_cqring_add_event(req, res, 0);
+ __io_req_complete(req, res, 0, NULL);
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1369,11 +1495,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
- if (!state) {
- req = kmem_cache_alloc(req_cachep, gfp);
- if (unlikely(!req))
- goto fallback;
- } else if (!state->free_reqs) {
+ if (!state->free_reqs) {
size_t sz;
int ret;
@@ -1411,21 +1533,15 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void __io_req_aux_free(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
- if (req->flags & REQ_F_NEED_CLEANUP)
- io_cleanup_req(req);
+ io_clean_op(req);
- kfree(req->io);
+ if (req->io)
+ kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- __io_put_req_task(req);
- io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- __io_req_aux_free(req);
+ io_req_clean_work(req);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1437,57 +1553,20 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
-
- percpu_ref_put(&req->ctx->refs);
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
-struct req_batch {
- void *reqs[IO_IOPOLL_BATCH];
- int to_free;
- int need_iter;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+static void __io_free_req(struct io_kiocb *req)
{
- if (!rb->to_free)
- return;
- if (rb->need_iter) {
- int i, inflight = 0;
- unsigned long flags;
-
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT)
- inflight++;
- __io_req_aux_free(req);
- }
- if (!inflight)
- goto do_free;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT) {
- list_del(&req->inflight_entry);
- if (!--inflight)
- break;
- }
- }
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ struct io_ring_ctx *ctx;
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- }
-do_free:
- kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- percpu_ref_put_many(&ctx->refs, rb->to_free);
- rb->to_free = rb->need_iter = 0;
+ io_dismantle_req(req);
+ __io_put_req_task(req);
+ ctx = req->ctx;
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+ percpu_ref_put(&ctx->refs);
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1507,53 +1586,67 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
return false;
}
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+ bool wake_ev;
+
+ if (list_empty(&req->link_list))
+ return false;
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ if (link->opcode != IORING_OP_LINK_TIMEOUT)
+ return false;
+
+ list_del_init(&link->link_list);
+ wake_ev = io_link_cancel_timeout(link);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- bool wake_ev = false;
+ bool wake_ev;
- /* Already got next link */
- if (req->flags & REQ_F_LINK_NEXT)
- return;
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ wake_ev = __io_kill_linked_timeout(req);
+ }
+
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *nxt = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
-
- if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
- (nxt->flags & REQ_F_TIMEOUT))) {
- list_del_init(&nxt->link_list);
- wake_ev |= io_link_cancel_timeout(nxt);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
- continue;
- }
-
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- *nxtptr = nxt;
- break;
- }
+ if (unlikely(list_empty(&req->link_list)))
+ return NULL;
- req->flags |= REQ_F_LINK_NEXT;
- if (wake_ev)
- io_cqring_ev_posted(ctx);
+ nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ list_del_init(&req->link_list);
+ if (!list_empty(&nxt->link_list))
+ nxt->flags |= REQ_F_LINK_HEAD;
+ return nxt;
}
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void io_fail_links(struct io_kiocb *req)
+static void __io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
@@ -1562,25 +1655,37 @@ static void io_fail_links(struct io_kiocb *req)
list_del_init(&link->link_list);
trace_io_uring_fail_link(req, link);
- if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->opcode == IORING_OP_LINK_TIMEOUT) {
- io_link_cancel_timeout(link);
- } else {
- io_cqring_fill_event(link, -ECANCELED);
- __io_double_put_req(link);
- }
+ io_cqring_fill_event(link, -ECANCELED);
+ __io_double_put_req(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
-static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_fail_links(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
- return;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ __io_fail_links(req);
+ }
+
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+{
+ req->flags &= ~REQ_F_LINK_HEAD;
+ if (req->flags & REQ_F_LINK_TIMEOUT)
+ io_kill_linked_timeout(req);
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -1588,62 +1693,187 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & REQ_F_FAIL_LINK) {
- io_fail_links(req);
- } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
- REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
+ if (likely(!(req->flags & REQ_F_FAIL_LINK)))
+ return io_req_link_next(req);
+ io_fail_links(req);
+ return NULL;
+}
- /*
- * If this is a timeout link, we could be racing with the
- * timeout timer. Grab the completion lock for this case to
- * protect against that.
- */
- spin_lock_irqsave(&ctx->completion_lock, flags);
- io_req_link_next(req, nxt);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ return NULL;
+ return __io_req_find_next(req);
+}
+
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_cqring_fill_event(req, error);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ __io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!__io_sq_thread_acquire_mm(ctx)) {
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL, NULL);
+ mutex_unlock(&ctx->uring_lock);
} else {
- io_req_link_next(req, nxt);
+ __io_req_task_cancel(req, -EFAULT);
}
}
-static void io_free_req(struct io_kiocb *req)
+static void io_req_task_submit(struct callback_head *cb)
{
- struct io_kiocb *nxt = NULL;
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- io_req_find_next(req, &nxt);
- __io_free_req(req);
+ __io_req_task_submit(req);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_req_task_submit);
+
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ init_task_work(&req->task_work, io_req_task_cancel);
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+}
+
+static void io_queue_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = io_req_find_next(req);
if (nxt)
- io_queue_async_work(nxt);
+ io_req_task_queue(nxt);
}
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+static void io_free_req(struct io_kiocb *req)
{
- struct io_kiocb *link;
- const struct io_op_def *def = &io_op_defs[nxt->opcode];
+ io_queue_next(req);
+ __io_free_req(req);
+}
- if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
- io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+struct req_batch {
+ void *reqs[IO_IOPOLL_BATCH];
+ int to_free;
- *workptr = &nxt->work;
- link = io_prep_linked_timeout(nxt);
- if (link)
- nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+ struct task_struct *task;
+ int task_refs;
+};
+
+static inline void io_init_req_batch(struct req_batch *rb)
+{
+ rb->to_free = 0;
+ rb->task_refs = 0;
+ rb->task = NULL;
+}
+
+static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
+ struct req_batch *rb)
+{
+ kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+ percpu_ref_put_many(&ctx->refs, rb->to_free);
+ rb->to_free = 0;
+}
+
+static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
+ struct req_batch *rb)
+{
+ if (rb->to_free)
+ __io_req_free_batch_flush(ctx, rb);
+ if (rb->task) {
+ put_task_struct_many(rb->task, rb->task_refs);
+ rb->task = NULL;
+ }
+}
+
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+{
+ if (unlikely(io_is_fallback_req(req))) {
+ io_free_req(req);
+ return;
+ }
+ if (req->flags & REQ_F_LINK_HEAD)
+ io_queue_next(req);
+
+ if (req->flags & REQ_F_TASK_PINNED) {
+ if (req->task != rb->task) {
+ if (rb->task)
+ put_task_struct_many(rb->task, rb->task_refs);
+ rb->task = req->task;
+ rb->task_refs = 0;
+ }
+ rb->task_refs++;
+ req->flags &= ~REQ_F_TASK_PINNED;
+ }
+
+ io_dismantle_req(req);
+ rb->reqs[rb->to_free++] = req;
+ if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+ __io_req_free_batch_flush(req->ctx, rb);
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
-__attribute__((nonnull))
-static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
+ struct io_kiocb *nxt = NULL;
+
if (refcount_dec_and_test(&req->refs)) {
- io_req_find_next(req, nxtptr);
+ nxt = io_req_find_next(req);
__io_free_req(req);
}
+ return nxt;
}
static void io_put_req(struct io_kiocb *req)
@@ -1652,24 +1882,20 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_steal_work(struct io_kiocb *req,
- struct io_wq_work **workptr)
+static struct io_wq_work *io_steal_work(struct io_kiocb *req)
{
+ struct io_kiocb *nxt;
+
/*
- * It's in an io-wq worker, so there always should be at least
- * one reference, which will be dropped in io_put_work() just
- * after the current handler returns.
- *
- * It also means, that if the counter dropped to 1, then there is
- * no asynchronous users left, so it's safe to steal the next work.
+ * A ref is owned by io-wq in which context we're. So, if that's the
+ * last one, it's safe to steal next work. False negatives are Ok,
+ * it just will be re-punted async in io_put_work()
*/
- if (refcount_read(&req->refs) == 1) {
- struct io_kiocb *nxt = NULL;
+ if (refcount_read(&req->refs) != 1)
+ return NULL;
- io_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
- }
+ nxt = io_req_find_next(req);
+ return nxt ? &nxt->work : NULL;
}
/*
@@ -1719,31 +1945,34 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
-static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
+static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
{
- if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
- return false;
+ unsigned int cflags;
- if (req->file || req->io)
- rb->need_iter++;
-
- rb->reqs[rb->to_free++] = req;
- if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
- io_free_req_many(req->ctx, rb);
- return true;
+ cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ cflags |= IORING_CQE_F_BUFFER;
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ kfree(kbuf);
+ return cflags;
}
-static int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
struct io_buffer *kbuf;
- int cflags;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
- req->rw.addr = 0;
- kfree(kbuf);
- return cflags;
+ return io_put_kbuf(req, kbuf);
+}
+
+static inline bool io_run_task_work(void)
+{
+ if (current->task_works) {
+ __set_current_state(TASK_RUNNING);
+ task_work_run();
+ return true;
+ }
+
+ return false;
}
static void io_iopoll_queue(struct list_head *again)
@@ -1751,18 +1980,9 @@ static void io_iopoll_queue(struct list_head *again)
struct io_kiocb *req;
do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
-
- /* shouldn't happen unless io_uring is dying, cancel reqs */
- if (unlikely(!current->mm)) {
- io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
- io_put_req(req);
- continue;
- }
-
- refcount_inc(&req->refs);
- io_queue_async_work(req);
+ req = list_first_entry(again, struct io_kiocb, inflight_entry);
+ list_del(&req->inflight_entry);
+ __io_complete_rw(req, -EAGAIN, 0, NULL);
} while (!list_empty(again));
}
@@ -1779,33 +1999,32 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
/* order with ->result store in io_complete_rw_iopoll() */
smp_rmb();
- rb.to_free = rb.need_iter = 0;
+ io_init_req_batch(&rb);
while (!list_empty(done)) {
int cflags = 0;
- req = list_first_entry(done, struct io_kiocb, list);
+ req = list_first_entry(done, struct io_kiocb, inflight_entry);
if (READ_ONCE(req->result) == -EAGAIN) {
req->iopoll_completed = 0;
- list_move_tail(&req->list, &again);
+ list_move_tail(&req->inflight_entry, &again);
continue;
}
- list_del(&req->list);
+ list_del(&req->inflight_entry);
if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_kbuf(req);
+ cflags = io_put_rw_kbuf(req);
__io_cqring_fill_event(req, req->result, cflags);
(*nr_events)++;
- if (refcount_dec_and_test(&req->refs) &&
- !io_req_multi_free(&rb, req))
- io_free_req(req);
+ if (refcount_dec_and_test(&req->refs))
+ io_req_free_batch(&rb, req);
}
io_commit_cqring(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
- io_free_req_many(ctx, &rb);
+ io_req_free_batch_finish(ctx, &rb);
if (!list_empty(&again))
io_iopoll_queue(&again);
@@ -1826,7 +2045,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
spin = !ctx->poll_multi_file && *nr_events < min;
ret = 0;
- list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+ list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
struct kiocb *kiocb = &req->rw.kiocb;
/*
@@ -1835,7 +2054,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* and complete those lists first, if we have entries there.
*/
if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->list, &done);
+ list_move_tail(&req->inflight_entry, &done);
continue;
}
if (!list_empty(&done))
@@ -1845,6 +2064,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (ret < 0)
break;
+ /* iopoll may have completed current req */
+ if (READ_ONCE(req->iopoll_completed))
+ list_move_tail(&req->inflight_entry, &done);
+
if (ret && spin)
spin = false;
ret = 0;
@@ -1864,13 +2087,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
- while (!list_empty(&ctx->poll_list) && !need_resched()) {
+ while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
int ret;
ret = io_do_iopoll(ctx, nr_events, min);
if (ret < 0)
return ret;
- if (!min || *nr_events >= min)
+ if (*nr_events >= min)
return 0;
}
@@ -1881,29 +2104,37 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->poll_list)) {
+ while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_iopoll_getevents(ctx, &nr_events, 1);
+ io_do_iopoll(ctx, &nr_events, 0);
+ /* let it sleep and repeat later if can't complete a request */
+ if (nr_events == 0)
+ break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
* in this case we need to ensure that we reap all events.
+ * Also let task_work, etc. to progress by releasing the mutex
*/
- cond_resched();
+ if (need_resched()) {
+ mutex_unlock(&ctx->uring_lock);
+ cond_resched();
+ mutex_lock(&ctx->uring_lock);
+ }
}
mutex_unlock(&ctx->uring_lock);
}
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
+ unsigned int nr_events = 0;
int iters = 0, ret = 0;
/*
@@ -1913,8 +2144,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
*/
mutex_lock(&ctx->uring_lock);
do {
- int tmin = 0;
-
/*
* Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that
@@ -1935,17 +2164,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
*/
if (!(++iters & 7)) {
mutex_unlock(&ctx->uring_lock);
+ io_run_task_work();
mutex_lock(&ctx->uring_lock);
}
- if (*nr_events < min)
- tmin = min - *nr_events;
-
- ret = io_iopoll_getevents(ctx, nr_events, tmin);
+ ret = io_iopoll_getevents(ctx, &nr_events, min);
if (ret <= 0)
break;
ret = 0;
- } while (min && !*nr_events && !need_resched());
+ } while (min && !nr_events && !need_resched());
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -1965,13 +2192,8 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
-static inline void req_set_fail_links(struct io_kiocb *req)
-{
- if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
-}
-
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res,
+ struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
int cflags = 0;
@@ -1982,16 +2204,96 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
if (res != req->result)
req_set_fail_links(req);
if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_kbuf(req);
- __io_cqring_add_event(req, res, cflags);
+ cflags = io_put_rw_kbuf(req);
+ __io_req_complete(req, res, cflags, cs);
+}
+
+#ifdef CONFIG_BLOCK
+static bool io_resubmit_prep(struct io_kiocb *req, int error)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ ssize_t ret = -ECANCELED;
+ struct iov_iter iter;
+ int rw;
+
+ if (error) {
+ ret = error;
+ goto end_req;
+ }
+
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ rw = READ;
+ break;
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ rw = WRITE;
+ break;
+ default:
+ printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
+ req->opcode);
+ goto end_req;
+ }
+
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ goto end_req;
+ ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+ if (!ret)
+ return true;
+ kfree(iovec);
+end_req:
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return false;
+}
+
+static void io_rw_resubmit(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
+ int err;
+
+ err = io_sq_thread_acquire_mm(ctx, req);
+
+ if (io_resubmit_prep(req, err)) {
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ }
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req, long res)
+{
+#ifdef CONFIG_BLOCK
+ int ret;
+
+ if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+ return false;
+
+ init_task_work(&req->task_work, io_rw_resubmit);
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (!ret)
+ return true;
+#endif
+ return false;
+}
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs)
+{
+ if (!io_rw_reissue(req, res))
+ io_complete_rw_common(&req->rw.kiocb, res, cs);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- io_complete_rw_common(kiocb, res);
- io_put_req(req);
+ __io_complete_rw(req, res, res2, NULL);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2025,13 +2327,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->poll_list)) {
+ if (list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_file = false;
} else if (!ctx->poll_multi_file) {
struct io_kiocb *list_req;
- list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
- list);
+ list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
+ inflight_entry);
if (list_req->file != req->file)
ctx->poll_multi_file = true;
}
@@ -2041,9 +2343,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->list, &ctx->poll_list);
+ list_add(&req->inflight_entry, &ctx->iopoll_list);
else
- list_add_tail(&req->list, &ctx->poll_list);
+ list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sqo_wait))
@@ -2052,10 +2354,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
static void __io_state_file_put(struct io_submit_state *state)
{
- int diff = state->has_refs - state->used_refs;
-
- if (diff)
- fput_many(state->file, diff);
+ if (state->has_refs)
+ fput_many(state->file, state->has_refs);
state->file = NULL;
}
@@ -2077,7 +2377,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (state->file) {
if (state->fd == fd) {
- state->used_refs++;
+ state->has_refs--;
state->ios_left--;
return state->file;
}
@@ -2088,12 +2388,20 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left;
- state->used_refs = 1;
state->ios_left--;
+ state->has_refs = state->ios_left;
return state->file;
}
+static bool io_bdev_nowait(struct block_device *bdev)
+{
+#ifdef CONFIG_BLOCK
+ return !bdev || queue_is_mq(bdev_get_queue(bdev));
+#else
+ return true;
+#endif
+}
+
/*
* If we tracked the file through the SCM inflight mechanism, we could support
* any file. For now, just ensure that anything potentially problematic is done
@@ -2103,10 +2411,19 @@ static bool io_file_supports_async(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
- if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
- return true;
- if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+ if (S_ISBLK(mode)) {
+ if (io_bdev_nowait(file->f_inode->i_bdev))
+ return true;
+ return false;
+ }
+ if (S_ISCHR(mode) || S_ISSOCK(mode))
return true;
+ if (S_ISREG(mode)) {
+ if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+ file->f_op != &io_uring_fops)
+ return true;
+ return false;
+ }
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
@@ -2157,6 +2474,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ io_get_req_task(req);
+
if (force_nonblock)
kiocb->ki_flags |= IOCB_NOWAIT;
@@ -2167,8 +2487,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
- req->result = 0;
req->iopoll_completed = 0;
+ io_get_req_task(req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2202,14 +2522,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+ struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
- io_complete_rw(kiocb, ret, 0);
+ __io_complete_rw(req, ret, 0, cs);
else
io_rw_done(kiocb, ret);
}
@@ -2465,10 +2786,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
- *iovec = iorw->iov;
- iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
- if (iorw->iov == iorw->fast_iov)
- *iovec = NULL;
+ iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
+ *iovec = NULL;
return iorw->size;
}
@@ -2553,15 +2872,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
- req->io->rw.nr_segs = iter->nr_segs;
- req->io->rw.size = io_size;
- req->io->rw.iov = iovec;
- if (!req->io->rw.iov) {
- req->io->rw.iov = req->io->rw.fast_iov;
- if (req->io->rw.iov != fast_iov)
- memcpy(req->io->rw.iov, fast_iov,
+ struct io_async_rw *rw = &req->io->rw;
+
+ rw->nr_segs = iter->nr_segs;
+ rw->size = io_size;
+ if (!iovec) {
+ rw->iov = rw->fast_iov;
+ if (rw->iov != fast_iov)
+ memcpy(rw->iov, fast_iov,
sizeof(struct iovec) * iter->nr_segs);
} else {
+ rw->iov = iovec;
req->flags |= REQ_F_NEED_CLEANUP;
}
}
@@ -2595,11 +2916,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
return 0;
}
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
+ bool force_nonblock)
+{
+ struct io_async_ctx *io = req->io;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ io->rw.iov = io->rw.fast_iov;
+ req->io = NULL;
+ ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
+ req->io = io;
+ if (unlikely(ret < 0))
+ return ret;
+
+ io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ return 0;
+}
+
static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
bool force_nonblock)
{
- struct io_async_ctx *io;
- struct iov_iter iter;
ssize_t ret;
ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2612,75 +2949,169 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ return io_rw_prep_async(req, READ, force_nonblock);
+}
- io = req->io;
- io->rw.iov = io->rw.fast_iov;
- req->io = NULL;
- ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
- if (ret < 0)
- return ret;
+static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *arg)
+{
+ struct wait_page_queue *wpq;
+ struct io_kiocb *req = wait->private;
+ struct wait_page_key *key = arg;
+ int ret;
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
- return 0;
+ wpq = container_of(wait, struct wait_page_queue, wait);
+
+ if (!wake_page_match(wpq, key))
+ return 0;
+
+ /* Stop waking things up if the page is locked again */
+ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
+
+ list_del_init(&wait->entry);
+
+ init_task_work(&req->task_work, io_req_task_submit);
+ /* submit ref gets dropped, acquire a new one */
+ refcount_inc(&req->refs);
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ /* queue just for cancelation */
+ init_task_work(&req->task_work, io_req_task_cancel);
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+ return 1;
+}
+
+static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
+ struct wait_page_queue *wait,
+ wait_queue_func_t func,
+ void *data)
+{
+ /* Can't support async wakeup with polled IO */
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
+ if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
+ wait->wait.func = func;
+ wait->wait.private = data;
+ wait->wait.flags = 0;
+ INIT_LIST_HEAD(&wait->wait.entry);
+ kiocb->ki_flags |= IOCB_WAITQ;
+ kiocb->ki_waitq = wait;
+ return 0;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+
+static bool io_rw_should_retry(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ int ret;
+
+ /* never retry for NOWAIT, we just complete with -EAGAIN */
+ if (req->flags & REQ_F_NOWAIT)
+ return false;
+
+ /* already tried, or we're doing O_DIRECT */
+ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+ return false;
+ /*
+ * just use poll if we can, and don't attempt if the fs doesn't
+ * support callback based unlocks
+ */
+ if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+ return false;
+
+ /*
+ * If request type doesn't require req->io to defer in general,
+ * we need to allocate it here
+ */
+ if (!req->io && __io_alloc_async_ctx(req))
+ return false;
+
+ ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
+ io_async_buf_func, req);
+ if (!ret) {
+ io_get_req_task(req);
+ return true;
+ }
+
+ return false;
+}
+
+static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+{
+ if (req->file->f_op->read_iter)
+ return call_read_iter(req->file, &req->rw.kiocb, iter);
+ return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
}
-static int io_read(struct io_kiocb *req, bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
size_t iov_count;
- ssize_t io_size, ret;
+ ssize_t io_size, ret, ret2;
+ unsigned long nr_segs;
ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
+ io_size = ret;
+ req->result = io_size;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
kiocb->ki_flags &= ~IOCB_NOWAIT;
- req->result = 0;
- io_size = ret;
- if (req->flags & REQ_F_LINK_HEAD)
- req->result = io_size;
-
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
+ /* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
iov_count = iov_iter_count(&iter);
+ nr_segs = iter.nr_segs;
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
- if (!ret) {
- ssize_t ret2;
+ if (unlikely(ret))
+ goto out_free;
- if (req->file->f_op->read_iter)
- ret2 = call_read_iter(req->file, kiocb, &iter);
- else
- ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+ ret2 = io_iter_do_read(req, &iter);
- /* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2);
- } else {
+ /* Catch -EAGAIN return for forced non-blocking submission */
+ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
+ kiocb_done(kiocb, ret2, cs);
+ } else {
+ iter.count = iov_count;
+ iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec,
- inline_vecs, &iter);
- if (ret)
+ ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+ &iter);
+ if (ret)
+ goto out_free;
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ /* if we can retry, do so with the callbacks armed */
+ if (io_rw_should_retry(req)) {
+ ret2 = io_iter_do_read(req, &iter);
+ if (ret2 == -EIOCBQUEUED) {
goto out_free;
- /* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT) &&
- !file_can_poll(req->file))
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
+ } else if (ret2 != -EAGAIN) {
+ kiocb_done(kiocb, ret2, cs);
+ goto out_free;
+ }
}
+ kiocb->ki_flags &= ~IOCB_WAITQ;
+ return -EAGAIN;
}
out_free:
- if (!(req->flags & REQ_F_NEED_CLEANUP))
+ if (iovec)
kfree(iovec);
return ret;
}
@@ -2688,8 +3119,6 @@ out_free:
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
bool force_nonblock)
{
- struct io_async_ctx *io;
- struct iov_iter iter;
ssize_t ret;
ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2699,49 +3128,33 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- req->fsize = rlimit(RLIMIT_FSIZE);
-
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
-
- io = req->io;
- io->rw.iov = io->rw.fast_iov;
- req->io = NULL;
- ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
- if (ret < 0)
- return ret;
-
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
- return 0;
+ return io_rw_prep_async(req, WRITE, force_nonblock);
}
-static int io_write(struct io_kiocb *req, bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
size_t iov_count;
- ssize_t ret, io_size;
+ ssize_t ret, ret2, io_size;
+ unsigned long nr_segs;
ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
+ io_size = ret;
+ req->result = io_size;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
- req->result = 0;
- io_size = ret;
- if (req->flags & REQ_F_LINK_HEAD)
- req->result = io_size;
-
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
+ /* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, WRITE))
goto copy_iov;
@@ -2751,59 +3164,53 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
goto copy_iov;
iov_count = iov_iter_count(&iter);
+ nr_segs = iter.nr_segs;
ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
- if (!ret) {
- ssize_t ret2;
-
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (req->flags & REQ_F_ISREG) {
- __sb_start_write(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE, true);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
-
- if (!force_nonblock)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ if (unlikely(ret))
+ goto out_free;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, &iter);
- else
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ /*
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * io_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
+ */
+ if (req->flags & REQ_F_ISREG) {
+ __sb_start_write(file_inode(req->file)->i_sb,
+ SB_FREEZE_WRITE, true);
+ __sb_writers_release(file_inode(req->file)->i_sb,
+ SB_FREEZE_WRITE);
+ }
+ kiocb->ki_flags |= IOCB_WRITE;
- if (!force_nonblock)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (req->file->f_op->write_iter)
+ ret2 = call_write_iter(req->file, kiocb, &iter);
+ else
+ ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
- /*
- * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
- * retry them without IOCB_NOWAIT.
- */
- if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
- ret2 = -EAGAIN;
- if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2);
- } else {
+ /*
+ * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+ * retry them without IOCB_NOWAIT.
+ */
+ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+ ret2 = -EAGAIN;
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ kiocb_done(kiocb, ret2, cs);
+ } else {
+ iter.count = iov_count;
+ iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec,
- inline_vecs, &iter);
- if (ret)
- goto out_free;
- /* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT) &&
- !file_can_poll(req->file))
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
- }
+ ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+ &iter);
+ if (ret)
+ goto out_free;
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ return -EAGAIN;
}
out_free:
- if (!(req->flags & REQ_F_NEED_CLEANUP))
+ if (iovec)
kfree(iovec);
return ret;
}
@@ -2869,10 +3276,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock)
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret != sp->len)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -2906,25 +3312,23 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret != sp->len)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
-static int io_nop(struct io_kiocb *req)
+static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- io_cqring_add_event(req, 0);
- io_put_req(req);
+ __io_req_complete(req, 0, 0, cs);
return 0;
}
@@ -2963,8 +3367,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -2979,7 +3382,6 @@ static int io_fallocate_prep(struct io_kiocb *req,
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
- req->fsize = rlimit(RLIMIT_FSIZE);
return 0;
}
@@ -2990,15 +3392,11 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
/* fallocate always requiring blocking context */
if (force_nonblock)
return -EAGAIN;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3094,8 +3492,7 @@ err:
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3149,7 +3546,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return i;
}
-static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
@@ -3168,8 +3566,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3227,7 +3624,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
return i ? i : -ENOMEM;
}
-static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
@@ -3256,8 +3654,7 @@ out:
io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3288,7 +3685,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
-static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
@@ -3300,8 +3698,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
#else
return -EOPNOTSUPP;
@@ -3337,8 +3734,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
ret = do_madvise(ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
#else
return -EOPNOTSUPP;
@@ -3377,8 +3773,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3417,8 +3812,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3449,7 +3843,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_close(struct io_kiocb *req, bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_close *close = &req->close;
int ret;
@@ -3463,8 +3858,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
/* if the file has a flush method, be safe and punt to async */
if (close->put_file->f_op->flush && force_nonblock) {
+ /* was never set, but play safe */
+ req->flags &= ~REQ_F_NOWAIT;
/* avoid grabbing files - we don't need the files */
- req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
+ req->flags |= REQ_F_NO_FILE_TABLE;
return -EAGAIN;
}
@@ -3472,10 +3869,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
ret = filp_close(close->put_file, req->work.files);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
fput(close->put_file);
close->put_file = NULL;
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3509,8 +3905,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -3530,6 +3925,15 @@ static int io_setup_async_msg(struct io_kiocb *req,
return -EAGAIN;
}
+static int io_sendmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
+{
+ iomsg->iov = iomsg->fast_iov;
+ iomsg->msg.msg_name = &iomsg->addr;
+ return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
+ req->sr_msg.msg_flags, &iomsg->iov);
+}
+
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
@@ -3540,7 +3944,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
- sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
#ifdef CONFIG_COMPAT
@@ -3554,136 +3958,126 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
- io->msg.msg.msg_name = &io->msg.addr;
- io->msg.iov = io->msg.fast_iov;
- ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
- &io->msg.iov);
+ ret = io_sendmsg_copy_hdr(req, &io->msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_async_msghdr *kmsg = NULL;
+ struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
+ unsigned flags;
int ret;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_async_ctx io;
- unsigned flags;
-
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
- struct io_sr_msg *sr = &req->sr_msg;
-
- kmsg = &io.msg;
- kmsg->msg.msg_name = &io.msg.addr;
+ if (unlikely(!sock))
+ return ret;
- io.msg.iov = io.msg.fast_iov;
- ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
- sr->msg_flags, &io.msg.iov);
- if (ret)
- return ret;
- }
+ if (req->io) {
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &req->io->msg.addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
+ } else {
+ ret = io_sendmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- if (kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ struct iovec iov;
struct socket *sock;
+ unsigned flags;
int ret;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- struct iovec iov;
- unsigned flags;
+ if (unlikely(!sock))
+ return ret;
- ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
- &msg.msg_iter);
- if (ret)
- return ret;
+ ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ return ret;;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &msg);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
struct io_sr_msg *sr = &req->sr_msg;
struct iovec __user *uiov;
size_t iov_len;
int ret;
- ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
- &uiov, &iov_len);
+ ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
+ &iomsg->uaddr, &uiov, &iov_len);
if (ret)
return ret;
if (req->flags & REQ_F_BUFFER_SELECT) {
if (iov_len > 1)
return -EINVAL;
- if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+ if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
return -EFAULT;
- sr->len = io->msg.iov[0].iov_len;
- iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+ sr->len = iomsg->iov[0].iov_len;
+ iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
sr->len);
- io->msg.iov = NULL;
+ iomsg->iov = NULL;
} else {
ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &io->msg.iov, &io->msg.msg.msg_iter);
+ &iomsg->iov, &iomsg->msg.msg_iter);
if (ret > 0)
ret = 0;
}
@@ -3693,7 +4087,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
#ifdef CONFIG_COMPAT
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_ctx *io)
+ struct io_async_msghdr *iomsg)
{
struct compat_msghdr __user *msg_compat;
struct io_sr_msg *sr = &req->sr_msg;
@@ -3702,8 +4096,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
compat_size_t len;
int ret;
- msg_compat = (struct compat_msghdr __user *) sr->msg;
- ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+ msg_compat = (struct compat_msghdr __user *) sr->umsg;
+ ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
&ptr, &len);
if (ret)
return ret;
@@ -3720,12 +4114,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
if (clen < 0)
return -EINVAL;
- sr->len = io->msg.iov[0].iov_len;
- io->msg.iov = NULL;
+ sr->len = iomsg->iov[0].iov_len;
+ iomsg->iov = NULL;
} else {
ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
- &io->msg.iov,
- &io->msg.msg.msg_iter);
+ &iomsg->iov,
+ &iomsg->msg.msg_iter);
if (ret < 0)
return ret;
}
@@ -3734,40 +4128,40 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
}
#endif
-static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
- io->msg.msg.msg_name = &io->msg.addr;
- io->msg.iov = io->msg.fast_iov;
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->iov = iomsg->fast_iov;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, io);
+ return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif
- return __io_recvmsg_copy_hdr(req, io);
+ return __io_recvmsg_copy_hdr(req, iomsg);
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- int *cflags, bool needs_lock)
+ bool needs_lock)
{
struct io_sr_msg *sr = &req->sr_msg;
struct io_buffer *kbuf;
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return NULL;
-
kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
sr->kbuf = kbuf;
req->flags |= REQ_F_BUFFER_SELECTED;
-
- *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- *cflags |= IORING_CQE_F_BUFFER;
return kbuf;
}
+static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
+{
+ return io_put_kbuf(req, req->sr_msg.kbuf);
+}
+
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -3779,7 +4173,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
- sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
@@ -3794,133 +4188,123 @@ static int io_recvmsg_prep(struct io_kiocb *req,
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
- ret = io_recvmsg_copy_hdr(req, io);
+ ret = io_recvmsg_copy_hdr(req, &io->msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_async_msghdr *kmsg = NULL;
+ struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
+ struct io_buffer *kbuf;
+ unsigned flags;
int ret, cflags = 0;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_buffer *kbuf;
- struct io_async_ctx io;
- unsigned flags;
-
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
- kmsg = &io.msg;
- kmsg->msg.msg_name = &io.msg.addr;
+ if (unlikely(!sock))
+ return ret;
- ret = io_recvmsg_copy_hdr(req, &io);
- if (ret)
- return ret;
- }
+ if (req->io) {
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &req->io->msg.addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
+ } else {
+ ret = io_recvmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
- kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
- if (IS_ERR(kbuf)) {
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ kbuf = io_recv_buffer_select(req, !force_nonblock);
+ if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- } else if (kbuf) {
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
- 1, req->sr_msg.len);
- }
+ kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+ 1, req->sr_msg.len);
+ }
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
- kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN) {
- ret = io_setup_async_msg(req, kmsg);
- if (ret != -EAGAIN)
- kfree(kbuf);
- return ret;
- }
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (kbuf)
- kfree(kbuf);
- }
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
+ kmsg->uaddr, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
- if (kmsg && kmsg->iov != kmsg->fast_iov)
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_recv_kbuf(req);
+ if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, cflags, cs);
return 0;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
- struct io_buffer *kbuf = NULL;
+ struct io_buffer *kbuf;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ void __user *buf = sr->buf;
struct socket *sock;
+ struct iovec iov;
+ unsigned flags;
int ret, cflags = 0;
sock = sock_from_file(req->file, &ret);
- if (sock) {
- struct io_sr_msg *sr = &req->sr_msg;
- void __user *buf = sr->buf;
- struct msghdr msg;
- struct iovec iov;
- unsigned flags;
+ if (unlikely(!sock))
+ return ret;
- kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ kbuf = io_recv_buffer_select(req, !force_nonblock);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- else if (kbuf)
- buf = u64_to_user_ptr(kbuf->addr);
+ buf = u64_to_user_ptr(kbuf->addr);
+ }
- ret = import_single_range(READ, buf, sr->len, &iov,
- &msg.msg_iter);
- if (ret) {
- kfree(kbuf);
- return ret;
- }
+ ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ goto out_free;
- req->flags |= REQ_F_NEED_CLEANUP;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
- msg.msg_iocb = NULL;
- msg.msg_flags = 0;
-
- flags = req->sr_msg.msg_flags;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
-
- ret = sock_recvmsg(sock, &msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- }
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
+ msg.msg_flags = 0;
- kfree(kbuf);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_cqring_add_event(req, ret, cflags);
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = sock_recvmsg(sock, &msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+out_free:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_recv_kbuf(req);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ __io_req_complete(req, ret, cflags, cs);
return 0;
}
@@ -3940,7 +4324,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_accept *accept = &req->accept;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -3959,8 +4344,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
ret = -EINTR;
req_set_fail_links(req);
}
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -3984,7 +4368,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
&io->connect.address);
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_async_ctx __io, *io;
unsigned file_flags;
@@ -4020,8 +4405,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock)
out:
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
#else /* !CONFIG_NET */
@@ -4030,12 +4414,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4046,12 +4432,14 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return -EOPNOTSUPP;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4061,7 +4449,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4071,7 +4460,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
return -EOPNOTSUPP;
}
@@ -4083,33 +4473,9 @@ struct io_poll_table {
int error;
};
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
-{
- struct task_struct *tsk = req->task;
- struct io_ring_ctx *ctx = req->ctx;
- int ret, notify = TWA_RESUME;
-
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup.
- * If we're not using an eventfd, then TWA_RESUME is always fine,
- * as we won't have dependencies between request completions for
- * other kernel wait conditions.
- */
- if (ctx->flags & IORING_SETUP_SQPOLL)
- notify = 0;
- else if (ctx->cq_ev_fd)
- notify = TWA_SIGNAL;
-
- ret = task_work_add(tsk, cb, notify);
- if (!ret)
- wake_up_process(tsk);
- return ret;
-}
-
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- struct task_struct *tsk;
int ret;
/* for instances that support it check for an event match first: */
@@ -4120,7 +4486,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
list_del_init(&poll->wait.entry);
- tsk = req->task;
req->result = mask;
init_task_work(&req->task_work, func);
/*
@@ -4131,6 +4496,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
*/
ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
+ struct task_struct *tsk;
+
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
task_work_add(tsk, &req->task_work, 0);
@@ -4159,9 +4526,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req)
+static void io_poll_remove_double(struct io_kiocb *req, void *data)
{
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = data;
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4181,7 +4548,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4199,7 +4566,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
- io_put_req_find_next(req, nxt);
+ *nxt = io_put_req_find_next(req);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -4211,34 +4578,29 @@ static void io_poll_task_func(struct callback_head *cb)
struct io_kiocb *nxt = NULL;
io_poll_task_handler(req, &nxt);
- if (nxt) {
- struct io_ring_ctx *ctx = nxt->ctx;
-
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(nxt, NULL);
- mutex_unlock(&ctx->uring_lock);
- }
+ if (nxt)
+ __io_req_task_submit(nxt);
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = req->apoll->double_poll;
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
- if (req->poll.head) {
+ if (poll && poll->head) {
bool done;
- spin_lock(&req->poll.head->lock);
- done = list_empty(&req->poll.wait.entry);
+ spin_lock(&poll->head->lock);
+ done = list_empty(&poll->wait.entry);
if (!done)
- list_del_init(&req->poll.wait.entry);
- spin_unlock(&req->poll.head->lock);
+ list_del_init(&poll->wait.entry);
+ spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
}
@@ -4258,7 +4620,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
}
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
+ struct wait_queue_head *head,
+ struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
@@ -4269,7 +4632,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (unlikely(poll->head)) {
/* already have a 2nd entry, fail a third attempt */
- if (req->io) {
+ if (*poll_ptr) {
pt->error = -EINVAL;
return;
}
@@ -4281,42 +4644,25 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
- req->io = (void *) poll;
+ *poll_ptr = poll;
}
pt->error = 0;
poll->head = head;
- add_wait_queue(head, &poll->wait);
+
+ if (poll->events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(head, &poll->wait);
+ else
+ add_wait_queue(head, &poll->wait);
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
-}
-
-static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- }
-}
-
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
- }
-
- return 0;
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
static void io_async_task_func(struct callback_head *cb)
@@ -4324,7 +4670,6 @@ static void io_async_task_func(struct callback_head *cb)
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- bool canceled = false;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
@@ -4334,38 +4679,19 @@ static void io_async_task_func(struct callback_head *cb)
}
/* If req is still hashed, it cannot have been canceled. Don't check. */
- if (hash_hashed(&req->hash_node)) {
+ if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- } else {
- canceled = READ_ONCE(apoll->poll.canceled);
- if (canceled) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
- }
- }
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
- /* restore ->work in case we need to retry again */
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work, sizeof(req->work));
- kfree(apoll);
+ if (!READ_ONCE(apoll->poll.canceled))
+ __io_req_task_submit(req);
+ else
+ __io_req_task_cancel(req, -ECANCELED);
- if (!canceled) {
- __set_current_state(TASK_RUNNING);
- if (io_sq_thread_acquire_mm(ctx, req)) {
- io_cqring_add_event(req, -EFAULT);
- goto end_req;
- }
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL);
- mutex_unlock(&ctx->uring_lock);
- } else {
- io_cqring_ev_posted(ctx);
-end_req:
- req_set_fail_links(req);
- io_double_put_req(req);
- }
+ kfree(apoll->double_poll);
+ kfree(apoll);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -4398,8 +4724,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
- poll->file = req->file;
io_init_poll_iocb(poll, mask, wake_func);
+ poll->file = req->file;
poll->wait.private = req;
ipt->pt._key = mask;
@@ -4436,11 +4762,10 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
- bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
- if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+ if (req->flags & REQ_F_POLLED)
return false;
if (!def->pollin && !def->pollout)
return false;
@@ -4448,12 +4773,9 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return false;
+ apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&apoll->work, &req->work, sizeof(req->work));
- had_io = req->io != NULL;
-
io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4470,13 +4792,9 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret) {
- ipt.error = 0;
- /* only remove double add if we did it here */
- if (!had_io)
- io_poll_remove_double(req);
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
return false;
}
@@ -4507,23 +4825,18 @@ static bool io_poll_remove_one(struct io_kiocb *req)
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
+ io_poll_remove_double(req, apoll->double_poll);
+
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
io_put_req(req);
- /*
- * restore ->work because we will call
- * io_req_work_drop_env below when dropping the
- * final reference.
- */
- if (req->flags & REQ_F_WORK_INITIALIZED)
- memcpy(&req->work, &apoll->work,
- sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
}
}
@@ -4603,10 +4916,9 @@ static int io_poll_remove(struct io_kiocb *req)
ret = io_poll_cancel(ctx, addr);
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
+ io_req_complete(req, ret);
return 0;
}
@@ -4624,13 +4936,13 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- u16 events;
+ u32 events;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4639,8 +4951,12 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (!poll->file)
return -EBADF;
- events = READ_ONCE(sqe->poll_events);
- poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+ events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+ events = swahw32(events);
+#endif
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
+ (events & EPOLLEXCLUSIVE);
io_get_req_task(req);
return 0;
@@ -4654,7 +4970,6 @@ static int io_poll_add(struct io_kiocb *req)
__poll_t mask;
INIT_HLIST_NODE(&req->hash_node);
- INIT_LIST_HEAD(&req->list);
ipt.pt._qproc = io_poll_queue_proc;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
@@ -4681,15 +4996,16 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- atomic_inc(&ctx->cq_timeouts);
-
spin_lock_irqsave(&ctx->completion_lock, flags);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+
/*
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
*/
- if (!list_empty(&req->list))
- list_del_init(&req->list);
+ if (!list_empty(&req->timeout.list))
+ list_del_init(&req->timeout.list);
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
@@ -4706,9 +5022,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
struct io_kiocb *req;
int ret = -ENOENT;
- list_for_each_entry(req, &ctx->timeout_list, list) {
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
if (user_data == req->user_data) {
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
ret = 0;
break;
}
@@ -4732,7 +5048,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
req->timeout.addr = READ_ONCE(sqe->addr);
@@ -4788,7 +5106,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data = &req->io->timeout;
data->req = req;
- req->flags |= REQ_F_TIMEOUT;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
@@ -4816,8 +5133,7 @@ static int io_timeout(struct io_kiocb *req)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- if (!off) {
- req->flags |= REQ_F_TIMEOUT_NOSEQ;
+ if (io_is_timeout_noseq(req)) {
entry = ctx->timeout_list.prev;
goto add;
}
@@ -4830,16 +5146,17 @@ static int io_timeout(struct io_kiocb *req)
* the one we need first.
*/
list_for_each_prev(entry, &ctx->timeout_list) {
- struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+ struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
+ timeout.list);
- if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(nxt))
continue;
/* nxt.seq is behind @tail, otherwise would've been completed */
if (off >= nxt->timeout.target_seq - tail)
break;
}
add:
- list_add(&req->list, entry);
+ list_add(&req->timeout.list, entry);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->completion_lock);
@@ -4910,8 +5227,9 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
- sqe->cancel_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -4929,7 +5247,9 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
@@ -4940,7 +5260,8 @@ static int io_files_update_prep(struct io_kiocb *req,
return 0;
}
-static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_files_update up;
@@ -4958,8 +5279,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ __io_req_complete(req, ret, 0, cs);
return 0;
}
@@ -4971,15 +5291,11 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (!sqe)
return 0;
- io_req_init_async(req);
-
- if (io_op_defs[req->opcode].file_table) {
- ret = io_grab_files(req);
- if (unlikely(ret))
- return ret;
- }
-
- io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+ ret = io_prep_work_files(req);
+ if (unlikely(ret))
+ return ret;
switch (req->opcode) {
case IORING_OP_NOP:
@@ -5081,86 +5397,117 @@ static int io_req_defer_prep(struct io_kiocb *req,
return ret;
}
+static u32 io_get_sequence(struct io_kiocb *req)
+{
+ struct io_kiocb *pos;
+ struct io_ring_ctx *ctx = req->ctx;
+ u32 total_submitted, nr_reqs = 1;
+
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(pos, &req->link_list, link_list)
+ nr_reqs++;
+
+ total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
+ return total_submitted - nr_reqs;
+}
+
static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_defer_entry *de;
int ret;
+ u32 seq;
/* Still need defer if there is pending req in defer list. */
- if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
+ if (likely(list_empty_careful(&ctx->defer_list) &&
+ !(req->flags & REQ_F_IO_DRAIN)))
+ return 0;
+
+ seq = io_get_sequence(req);
+ /* Still a chance to pass the sequence check */
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
if (!req->io) {
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
- if (ret < 0)
+ if (ret)
return ret;
}
+ io_prep_async_link(req);
+ de = kmalloc(sizeof(*de), GFP_KERNEL);
+ if (!de)
+ return -ENOMEM;
spin_lock_irq(&ctx->completion_lock);
- if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
+ if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
- return 0;
+ kfree(de);
+ io_queue_async_work(req);
+ return -EIOCBQUEUED;
}
trace_io_uring_defer(ctx, req, req->user_data);
- list_add_tail(&req->list, &ctx->defer_list);
+ de->req = req;
+ de->seq = seq;
+ list_add_tail(&de->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static void io_cleanup_req(struct io_kiocb *req)
+static void __io_clean_op(struct io_kiocb *req)
{
struct io_async_ctx *io = req->io;
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
kfree((void *)(unsigned long)req->rw.addr);
- /* fallthrough */
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- if (io->rw.iov != io->rw.fast_iov)
- kfree(io->rw.iov);
- break;
- case IORING_OP_RECVMSG:
- if (req->flags & REQ_F_BUFFER_SELECTED)
- kfree(req->sr_msg.kbuf);
- /* fallthrough */
- case IORING_OP_SENDMSG:
- if (io->msg.iov != io->msg.fast_iov)
- kfree(io->msg.iov);
- break;
- case IORING_OP_RECV:
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ break;
+ case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
kfree(req->sr_msg.kbuf);
- break;
- case IORING_OP_OPENAT:
- case IORING_OP_OPENAT2:
- break;
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- io_put_file(req, req->splice.file_in,
- (req->splice.flags & SPLICE_F_FD_IN_FIXED));
- break;
+ break;
+ }
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
+
+ if (req->flags & REQ_F_NEED_CLEANUP) {
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ if (io->rw.iov != io->rw.fast_iov)
+ kfree(io->rw.iov);
+ break;
+ case IORING_OP_RECVMSG:
+ case IORING_OP_SENDMSG:
+ if (io->msg.iov != io->msg.fast_iov)
+ kfree(io->msg.iov);
+ break;
+ case IORING_OP_SPLICE:
+ case IORING_OP_TEE:
+ io_put_file(req, req->splice.file_in,
+ (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+ break;
+ }
+ req->flags &= ~REQ_F_NEED_CLEANUP;
}
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
}
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ bool force_nonblock, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
switch (req->opcode) {
case IORING_OP_NOP:
- ret = io_nop(req);
+ ret = io_nop(req, cs);
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
@@ -5170,7 +5517,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_read(req, force_nonblock);
+ ret = io_read(req, force_nonblock, cs);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
@@ -5180,7 +5527,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_write(req, force_nonblock);
+ ret = io_write(req, force_nonblock, cs);
break;
case IORING_OP_FSYNC:
if (sqe) {
@@ -5222,9 +5569,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, force_nonblock);
+ ret = io_sendmsg(req, force_nonblock, cs);
else
- ret = io_send(req, force_nonblock);
+ ret = io_send(req, force_nonblock, cs);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
@@ -5234,9 +5581,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, force_nonblock);
+ ret = io_recvmsg(req, force_nonblock, cs);
else
- ret = io_recv(req, force_nonblock);
+ ret = io_recv(req, force_nonblock, cs);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
@@ -5260,7 +5607,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_accept(req, force_nonblock);
+ ret = io_accept(req, force_nonblock, cs);
break;
case IORING_OP_CONNECT:
if (sqe) {
@@ -5268,7 +5615,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_connect(req, force_nonblock);
+ ret = io_connect(req, force_nonblock, cs);
break;
case IORING_OP_ASYNC_CANCEL:
if (sqe) {
@@ -5300,7 +5647,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_close(req, force_nonblock);
+ ret = io_close(req, force_nonblock, cs);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
@@ -5308,7 +5655,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_files_update(req, force_nonblock);
+ ret = io_files_update(req, force_nonblock, cs);
break;
case IORING_OP_STATX:
if (sqe) {
@@ -5348,7 +5695,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_epoll_ctl(req, force_nonblock);
+ ret = io_epoll_ctl(req, force_nonblock, cs);
break;
case IORING_OP_SPLICE:
if (sqe) {
@@ -5364,7 +5711,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_provide_buffers(req, force_nonblock);
+ ret = io_provide_buffers(req, force_nonblock, cs);
break;
case IORING_OP_REMOVE_BUFFERS:
if (sqe) {
@@ -5372,7 +5719,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_remove_buffers(req, force_nonblock);
+ ret = io_remove_buffers(req, force_nonblock, cs);
break;
case IORING_OP_TEE:
if (sqe) {
@@ -5407,25 +5754,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static void io_arm_async_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- /* link head's timeout is queued in io_queue_async_work() */
- if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
- return;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
-}
-
-static void io_wq_submit_work(struct io_wq_work **workptr)
+static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
{
- struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_kiocb *timeout;
int ret = 0;
- io_arm_async_linked_timeout(req);
+ timeout = io_prep_linked_timeout(req);
+ if (timeout)
+ io_queue_linked_timeout(timeout);
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
@@ -5435,7 +5772,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
if (!ret) {
do {
- ret = io_issue_sqe(req, NULL, false);
+ ret = io_issue_sqe(req, NULL, false, NULL);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -5449,11 +5786,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
if (ret) {
req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
+ io_req_complete(req, ret);
}
- io_steal_work(req, workptr);
+ return io_steal_work(req);
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -5510,6 +5846,8 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
+ io_req_init_async(req);
+
if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
return 0;
if (!ctx->ring_file)
@@ -5535,6 +5873,13 @@ static int io_grab_files(struct io_kiocb *req)
return ret;
}
+static inline int io_prep_work_files(struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].file_table)
+ return 0;
+ return io_grab_files(req);
+}
+
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
@@ -5567,8 +5912,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
io_put_req(prev);
} else {
- io_cqring_add_event(req, -ETIME);
- io_put_req(req);
+ io_req_complete(req, -ETIME);
}
return HRTIMER_NORESTART;
}
@@ -5601,8 +5945,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
- /* for polled retry, if flag is set, we already went through here */
- if (req->flags & REQ_F_POLLED)
+ if (req->flags & REQ_F_LINK_TIMEOUT)
return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
@@ -5614,7 +5957,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt;
@@ -5634,54 +5978,45 @@ again:
old_creds = override_creds(req->work.creds);
}
- ret = io_issue_sqe(req, sqe, true);
+ ret = io_issue_sqe(req, sqe, true, cs);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
- (req->flags & REQ_F_MUST_PUNT))) {
- if (io_arm_poll_handler(req)) {
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- goto exit;
- }
+ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ if (!io_arm_poll_handler(req)) {
punt:
- io_req_init_async(req);
-
- if (io_op_defs[req->opcode].file_table) {
- ret = io_grab_files(req);
- if (ret)
+ ret = io_prep_work_files(req);
+ if (unlikely(ret))
goto err;
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
}
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
goto exit;
}
+ if (unlikely(ret)) {
err:
- nxt = NULL;
- /* drop submission reference */
- io_put_req_find_next(req, &nxt);
-
- if (linked_timeout) {
- if (!ret)
- io_queue_linked_timeout(linked_timeout);
- else
- io_put_req(linked_timeout);
- }
-
- /* and drop final reference, if we failed */
- if (ret) {
- io_cqring_add_event(req, ret);
+ /* un-prep timeout, so it'll be killed as any other linked */
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
+ io_req_complete(req, ret);
+ goto exit;
}
+
+ /* drop submission reference */
+ nxt = io_put_req_find_next(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+
if (nxt) {
req = nxt;
@@ -5694,7 +6029,8 @@ exit:
revert_creds(old_creds);
}
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs)
{
int ret;
@@ -5702,17 +6038,14 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
- io_cqring_add_event(req, ret);
req_set_fail_links(req);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
if (!req->io) {
- ret = -EAGAIN;
- if (io_alloc_async_ctx(req))
- goto fail_req;
ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret < 0))
+ if (unlikely(ret))
goto fail_req;
}
@@ -5720,24 +6053,26 @@ fail_req:
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
- __io_queue_sqe(req, sqe);
+ __io_queue_sqe(req, sqe, cs);
}
}
-static inline void io_queue_link_head(struct io_kiocb *req)
+static inline void io_queue_link_head(struct io_kiocb *req,
+ struct io_comp_state *cs)
{
if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
- io_cqring_add_event(req, -ECANCELED);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, -ECANCELED);
} else
- io_queue_sqe(req, NULL);
+ io_queue_sqe(req, NULL, cs);
}
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link)
+ struct io_kiocb **link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -5763,21 +6098,19 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
-
ret = io_req_defer_prep(req, sqe);
- if (ret) {
+ if (unlikely(ret)) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
return ret;
}
trace_io_uring_link(ctx, req, head);
+ io_get_req_task(req);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_link_head(head);
+ io_queue_link_head(head, cs);
*link = NULL;
}
} else {
@@ -5789,15 +6122,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
-
ret = io_req_defer_prep(req, sqe);
- if (ret)
+ if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
*link = req;
} else {
- io_queue_sqe(req, sqe);
+ io_queue_sqe(req, sqe, cs);
}
}
@@ -5809,6 +6139,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
*/
static void io_submit_state_end(struct io_submit_state *state)
{
+ if (!list_empty(&state->comp.list))
+ io_submit_flush_completions(&state->comp);
blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
@@ -5819,9 +6151,15 @@ static void io_submit_state_end(struct io_submit_state *state)
* Start submission side cache.
*/
static void io_submit_state_start(struct io_submit_state *state,
- unsigned int max_ios)
+ struct io_ring_ctx *ctx, unsigned int max_ios)
{
blk_start_plug(&state->plug);
+#ifdef CONFIG_BLOCK
+ state->plug.nowait = true;
+#endif
+ state->comp.nr = 0;
+ INIT_LIST_HEAD(&state->comp.list);
+ state->comp.ctx = ctx;
state->free_reqs = 0;
state->file = NULL;
state->ios_left = max_ios;
@@ -5886,12 +6224,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
unsigned int sqe_flags;
int id;
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
req->io = NULL;
@@ -5939,7 +6271,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct file *ring_file, int ring_fd)
{
- struct io_submit_state state, *statep = NULL;
+ struct io_submit_state state;
struct io_kiocb *link = NULL;
int i, submitted = 0;
@@ -5956,10 +6288,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- if (nr > IO_PLUG_THRESHOLD) {
- io_submit_state_start(&state, nr);
- statep = &state;
- }
+ io_submit_state_start(&state, ctx, nr);
ctx->ring_fd = ring_fd;
ctx->ring_file = ring_file;
@@ -5974,28 +6303,28 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
io_consume_sqe(ctx);
break;
}
- req = io_alloc_req(ctx, statep);
+ req = io_alloc_req(ctx, &state);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- err = io_init_req(ctx, req, sqe, statep);
+ err = io_init_req(ctx, req, sqe, &state);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
if (unlikely(err)) {
fail_req:
- io_cqring_add_event(req, err);
- io_double_put_req(req);
+ io_put_req(req);
+ io_req_complete(req, err);
break;
}
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, io_async_submit(ctx));
- err = io_submit_sqe(req, sqe, &link);
+ err = io_submit_sqe(req, sqe, &link, &state.comp);
if (err)
goto fail_req;
}
@@ -6006,9 +6335,8 @@ fail_req:
percpu_ref_put_many(&ctx->refs, nr - ref_used);
}
if (link)
- io_queue_link_head(link);
- if (statep)
- io_submit_state_end(&state);
+ io_queue_link_head(link, &state.comp);
+ io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -6016,6 +6344,21 @@ fail_req:
return submitted;
}
+static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
+{
+ /* Tell userspace we may need a wakeup call */
+ spin_lock_irq(&ctx->completion_lock);
+ ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
+static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
@@ -6032,12 +6375,12 @@ static int io_sq_thread(void *data)
while (!kthread_should_park()) {
unsigned int to_submit;
- if (!list_empty(&ctx->poll_list)) {
+ if (!list_empty(&ctx->iopoll_list)) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->poll_list))
- io_iopoll_getevents(ctx, &nr_events, 0);
+ if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ io_do_iopoll(ctx, &nr_events, 0);
else
timeout = jiffies + ctx->sq_thread_idle;
mutex_unlock(&ctx->uring_lock);
@@ -6056,7 +6399,7 @@ static int io_sq_thread(void *data)
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- io_sq_thread_drop_mm(ctx);
+ io_sq_thread_drop_mm();
/*
* We're polling. If we're within the defined idle
@@ -6065,11 +6408,10 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (!list_empty(&ctx->poll_list) || need_resched() ||
+ if (!list_empty(&ctx->iopoll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
cond_resched();
continue;
}
@@ -6079,21 +6421,18 @@ static int io_sq_thread(void *data)
/*
* While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to poll_list, it
- * is because reqs may have been punted to io worker and
- * will be added to poll_list later, hence check the
- * poll_list again.
+ * to check if there are new reqs added to iopoll_list,
+ * it is because reqs may have been punted to io worker
+ * and will be added to iopoll_list later, hence check
+ * the iopoll_list again.
*/
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->poll_list)) {
+ !list_empty_careful(&ctx->iopoll_list)) {
finish_wait(&ctx->sqo_wait, &wait);
continue;
}
- /* Tell userspace we may need a wakeup call */
- spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
- spin_unlock_irq(&ctx->completion_lock);
+ io_ring_set_wakeup_flag(ctx);
to_submit = io_sqring_entries(ctx);
if (!to_submit || ret == -EBUSY) {
@@ -6101,9 +6440,9 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
- if (current->task_works) {
- task_work_run();
+ if (io_run_task_work()) {
finish_wait(&ctx->sqo_wait, &wait);
+ io_ring_clear_wakeup_flag(ctx);
continue;
}
if (signal_pending(current))
@@ -6111,17 +6450,13 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
- spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irq(&ctx->completion_lock);
+ io_ring_clear_wakeup_flag(ctx);
ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
- spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irq(&ctx->completion_lock);
+ io_ring_clear_wakeup_flag(ctx);
}
mutex_lock(&ctx->uring_lock);
@@ -6131,10 +6466,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
- io_sq_thread_drop_mm(ctx);
+ io_sq_thread_drop_mm();
revert_creds(old_cred);
kthread_parkme();
@@ -6197,9 +6531,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
if (io_cqring_events(ctx, false) >= min_events)
return 0;
- if (!current->task_works)
+ if (!io_run_task_work())
break;
- task_work_run();
} while (1);
if (sig) {
@@ -6221,8 +6554,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
/* make sure we run task_work before checking for signals */
- if (current->task_works)
- task_work_run();
+ if (io_run_task_work())
+ continue;
if (signal_pending(current)) {
if (current->jobctl & JOBCTL_TASK_WORK) {
spin_lock_irq(&current->sighand->siglock);
@@ -7008,17 +7341,21 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
return 0;
err:
io_finish_async(ctx);
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
+ if (ctx->sqo_mm) {
+ mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ }
return ret;
}
-static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+static inline void __io_unaccount_mem(struct user_struct *user,
+ unsigned long nr_pages)
{
atomic_long_sub(nr_pages, &user->locked_vm);
}
-static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+static inline int __io_account_mem(struct user_struct *user,
+ unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -7036,6 +7373,41 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
return 0;
}
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+ enum io_mem_account acct)
+{
+ if (ctx->limit_mem)
+ __io_unaccount_mem(ctx->user, nr_pages);
+
+ if (ctx->sqo_mm) {
+ if (acct == ACCT_LOCKED)
+ ctx->sqo_mm->locked_vm -= nr_pages;
+ else if (acct == ACCT_PINNED)
+ atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+ }
+}
+
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+ enum io_mem_account acct)
+{
+ int ret;
+
+ if (ctx->limit_mem) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ return ret;
+ }
+
+ if (ctx->sqo_mm) {
+ if (acct == ACCT_LOCKED)
+ ctx->sqo_mm->locked_vm += nr_pages;
+ else if (acct == ACCT_PINNED)
+ atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+ }
+
+ return 0;
+}
+
static void io_mem_free(void *ptr)
{
struct page *page;
@@ -7072,6 +7444,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return SIZE_MAX;
#endif
+ if (sq_offset)
+ *sq_offset = off;
+
sq_array_size = array_size(sizeof(u32), sq_entries);
if (sq_array_size == SIZE_MAX)
return SIZE_MAX;
@@ -7079,9 +7454,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
if (check_add_overflow(off, sq_array_size, &off))
return SIZE_MAX;
- if (sq_offset)
- *sq_offset = off;
-
return off;
}
@@ -7110,8 +7482,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
for (j = 0; j < imu->nr_bvecs; j++)
unpin_user_page(imu->bvec[j].bv_page);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, imu->nr_bvecs);
+ io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -7194,11 +7565,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- if (ctx->account_mem) {
- ret = io_account_mem(ctx->user, nr_pages);
- if (ret)
- goto err;
- }
+ ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
+ if (ret)
+ goto err;
ret = 0;
if (!pages || nr_pages > got_pages) {
@@ -7211,8 +7580,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
if (!pages || !vmas) {
ret = -ENOMEM;
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
got_pages = nr_pages;
@@ -7222,8 +7590,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
ret = -ENOMEM;
if (!imu->bvec) {
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
@@ -7254,8 +7621,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (pret > 0)
unpin_user_pages(pages, pret);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user, nr_pages);
+ io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
kvfree(imu->bvec);
goto err;
}
@@ -7339,11 +7705,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
- if (ctx->sqo_mm)
+ io_sqe_buffer_unregister(ctx);
+ if (ctx->sqo_mm) {
mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ }
- io_iopoll_reap_events(ctx);
- io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
@@ -7407,11 +7774,8 @@ static int io_remove_personalities(int id, void *p, void *data)
static void io_ring_exit_work(struct work_struct *work)
{
- struct io_ring_ctx *ctx;
-
- ctx = container_of(work, struct io_ring_ctx, exit_work);
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+ exit_work);
/*
* If we're doing polled IO and end up having requests being
@@ -7419,11 +7783,11 @@ static void io_ring_exit_work(struct work_struct *work)
* we're waiting for refs to drop. We need to reap these manually,
* as nobody else will be looking for them.
*/
- while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
- io_iopoll_reap_events(ctx);
+ do {
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- }
+ io_iopoll_try_reap_events(ctx);
+ } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
@@ -7439,10 +7803,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
- io_iopoll_reap_events(ctx);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
+ io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
/*
@@ -7450,9 +7814,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
* is closed but resources aren't reaped yet. This can cause
* spurious failure in setting up a new ring.
*/
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user,
- ring_pages(ctx->sq_entries, ctx->cq_entries));
+ io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
+ ACCT_LOCKED);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
queue_work(system_wq, &ctx->exit_work);
@@ -7508,17 +7871,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (cancel_req->flags & REQ_F_OVERFLOW) {
spin_lock_irq(&ctx->completion_lock);
- list_del(&cancel_req->list);
+ list_del(&cancel_req->compl.list);
cancel_req->flags &= ~REQ_F_OVERFLOW;
- if (list_empty(&ctx->cq_overflow_list)) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
- }
- spin_unlock_irq(&ctx->completion_lock);
+ io_cqring_mark_overflow(ctx);
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
/*
* Put inflight ref and overflow ref. If that's
@@ -7641,8 +8001,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
@@ -7681,8 +8040,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
- unsigned nr_events = 0;
-
min_complete = min(min_complete, ctx->cq_entries);
/*
@@ -7693,7 +8050,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_iopoll_check(ctx, &nr_events, min_complete);
+ ret = io_iopoll_check(ctx, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
}
@@ -7898,7 +8255,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
{
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
- bool account_mem;
+ bool limit_mem;
int ret;
if (!entries)
@@ -7937,10 +8294,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
user = get_uid(current_user());
- account_mem = !capable(CAP_IPC_LOCK);
+ limit_mem = !capable(CAP_IPC_LOCK);
- if (account_mem) {
- ret = io_account_mem(user,
+ if (limit_mem) {
+ ret = __io_account_mem(user,
ring_pages(p->sq_entries, p->cq_entries));
if (ret) {
free_uid(user);
@@ -7950,14 +8307,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx = io_ring_ctx_alloc(p);
if (!ctx) {
- if (account_mem)
- io_unaccount_mem(user, ring_pages(p->sq_entries,
+ if (limit_mem)
+ __io_unaccount_mem(user, ring_pages(p->sq_entries,
p->cq_entries));
free_uid(user);
return -ENOMEM;
}
ctx->compat = in_compat_syscall();
- ctx->account_mem = account_mem;
ctx->user = user;
ctx->creds = get_current_cred();
@@ -7989,12 +8345,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
+ IORING_FEAT_POLL_32BITS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
goto err;
}
+
+ /*
+ * Account memory _before_ installing the file descriptor. Once
+ * the descriptor is installed, it can get closed at any time.
+ */
+ io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+ ACCT_LOCKED);
+ ctx->limit_mem = limit_mem;
+
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
@@ -8278,7 +8644,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
+ BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
+ BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d634561f871a..78f5c96c76f3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -612,9 +612,6 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block)
/*
* Initialize the superblock and read the root inode.
- *
- * Note: a check_disk_change() has been done immediately prior
- * to this call, so we don't need to check again.
*/
static int isofs_fill_super(struct super_block *s, void *data, int silent)
{
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index eb8b9e233d73..2935d4c776ec 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -36,6 +36,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 66acea9d878b..bde787c354fc 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -6,6 +6,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
diff --git a/fs/locks.c b/fs/locks.c
index 7df0f9fa66f4..938fe325bc54 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1282,6 +1282,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
if (!new_fl)
goto out;
locks_copy_lock(new_fl, request);
+ locks_move_blocks(new_fl, request);
request = new_fl;
new_fl = NULL;
locks_insert_lock_ctx(request, &fl->fl_list);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cce2510b2cca..c9056316a0b3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -507,6 +507,17 @@ find_any_file(struct nfs4_file *f)
return ret;
}
+static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+{
+ struct nfsd_file *ret = NULL;
+
+ spin_lock(&f->fi_lock);
+ if (f->fi_deleg_file)
+ ret = nfsd_file_get(f->fi_deleg_file);
+ spin_unlock(&f->fi_lock);
+ return ret;
+}
+
static atomic_long_t num_delegations;
unsigned long max_delegations;
@@ -2444,6 +2455,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2481,6 +2494,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2513,7 +2528,9 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = nf->fi_deleg_file;
+ file = find_deleg_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2529,6 +2546,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_fname(s, file);
seq_printf(s, " }\n");
+ nfsd_file_put(file);
return 0;
}
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 3c4811469ae8..a87d4391e6b5 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -8,6 +8,7 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include "dir.h"
#include "aops.h"
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 37d38697eaf8..837971e74109 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
static int devinfo_show(struct seq_file *f, void *v)
{
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7b4bac91146b..bb02989d92b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -78,6 +78,7 @@
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index ff336513c254..155b82870333 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -15,6 +15,7 @@
#include "reiserfs.h"
#include <linux/init.h>
#include <linux/proc_fs.h>
+#include <linux/blkdev.h>
/*
* LOCKING:
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 64f61330564a..76bb1c846845 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -175,7 +175,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
/* Extract the length of the metadata block */
data = page_address(bvec->bv_page) + bvec->bv_offset;
length = data[offset];
- if (offset <= bvec->bv_len - 1) {
+ if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
} else {
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 52de29000c7e..6e264dded46e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -339,7 +339,6 @@ out:
return ret;
}
-/* Should pair with userfaultfd_signal_pending() */
static inline long userfaultfd_get_blocking_state(unsigned int flags)
{
if (flags & FAULT_FLAG_INTERRUPTIBLE)
@@ -351,18 +350,6 @@ static inline long userfaultfd_get_blocking_state(unsigned int flags)
return TASK_UNINTERRUPTIBLE;
}
-/* Should pair with userfaultfd_get_blocking_state() */
-static inline bool userfaultfd_signal_pending(unsigned int flags)
-{
- if (flags & FAULT_FLAG_INTERRUPTIBLE)
- return signal_pending(current);
-
- if (flags & FAULT_FLAG_KILLABLE)
- return fatal_signal_pending(current);
-
- return false;
-}
-
/*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -516,33 +503,9 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
vmf->flags, reason);
mmap_read_unlock(mm);
- if (likely(must_wait && !READ_ONCE(ctx->released) &&
- !userfaultfd_signal_pending(vmf->flags))) {
+ if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
- ret |= VM_FAULT_MAJOR;
-
- /*
- * False wakeups can orginate even from rwsem before
- * up_read() however userfaults will wait either for a
- * targeted wakeup on the specific uwq waitqueue from
- * wake_userfault() or for signals or for uffd
- * release.
- */
- while (!READ_ONCE(uwq.waken)) {
- /*
- * This needs the full smp_store_mb()
- * guarantee as the state write must be
- * visible to other CPUs before reading
- * uwq.waken from other CPUs.
- */
- set_current_state(blocking_state);
- if (READ_ONCE(uwq.waken) ||
- READ_ONCE(ctx->released) ||
- userfaultfd_signal_pending(vmf->flags))
- break;
- schedule();
- }
}
__set_current_state(TASK_RUNNING);
diff --git a/fs/verity/open.c b/fs/verity/open.c
index d007db0c9304..bfe0280c14e4 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -221,11 +221,20 @@ out:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
- * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
- * This pairs with the READ_ONCE() in fsverity_get_info().
+ * Multiple tasks may race to set ->i_verity_info, so use
+ * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fsverity_get_info(). I.e., here we publish ->i_verity_info with a
+ * RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
+ if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
+ /* Lost the race, so free the fsverity_info we allocated. */
fsverity_free_info(vi);
+ /*
+ * Afterwards, the caller may access ->i_verity_info directly,
+ * so make sure to ACQUIRE the winning fsverity_info.
+ */
+ (void)fsverity_get_info(inode);
+ }
}
void fsverity_free_info(struct fsverity_info *vi)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 00db81eac80d..fdbff4860d61 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1080,7 +1080,7 @@ xfs_file_open(
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return 0;
}
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index 4bcc3e61056c..b03333f1c84a 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -132,5 +132,5 @@ xfs_pwork_guess_datadev_parallelism(
* For now we'll go with the most conservative setting possible,
* which is two threads for an SSD and 1 thread everywhere else.
*/
- return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+ return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 07bc42d62673..abfb17f88f9a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -607,14 +607,14 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
int nr_pages;
ssize_t ret;
- nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
- if (!nr_pages)
- return 0;
-
max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
+ nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+ if (!nr_pages)
+ return 0;
+
bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
if (!bio)
return -ENOMEM;
@@ -1119,7 +1119,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
char *file_name;
struct dentry *dir;
unsigned int n = 0;
- int ret = -ENOMEM;
+ int ret;
/* If the group is empty, there is nothing to do */
if (!zd->nr_zones[type])
@@ -1135,8 +1135,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
zgroup_name = "seq";
dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (!dir)
+ if (!dir) {
+ ret = -ENOMEM;
goto free;
+ }
/*
* The first zone contains the super block: skip it.
@@ -1174,8 +1176,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
* Use the file number within its group as file name.
*/
snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- if (!zonefs_create_inode(dir, file_name, zone, type))
+ if (!zonefs_create_inode(dir, file_name, zone, type)) {
+ ret = -ENOMEM;
goto free;
+ }
n++;
}