Diffstat (limited to 'fs')
86 files changed, 4211 insertions(+), 2684 deletions(-)
diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a3cc8ecb50da..d553bb5bc17a 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/user_namespace.h> +#include <linux/blkdev.h> #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" diff --git a/fs/affs/file.c b/fs/affs/file.c index a85817f54483..a26a0f96c119 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -14,6 +14,7 @@ */ #include <linux/uio.h> +#include <linux/blkdev.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 64cdf4d8e424..2482032021ca 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -22,6 +22,7 @@ #include <linux/cred.h> #include <linux/exportfs.h> #include <linux/seq_file.h> +#include <linux/blkdev.h> #include "befs.h" #include "btree.h" diff --git a/fs/block_dev.c b/fs/block_dev.c index 0ae656e022fd..8ae833e00443 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -105,16 +105,7 @@ EXPORT_SYMBOL(invalidate_bdev); static void set_init_blocksize(struct block_device *bdev) { - unsigned bsize = bdev_logical_block_size(bdev); - loff_t size = i_size_read(bdev->bd_inode); - - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); + bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); } int set_blocksize(struct block_device *bdev, int size) @@ -128,9 +119,8 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Don't change the size if it is same as current */ - if (bdev->bd_block_size != size) { + if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); - bdev->bd_block_size = size; bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } @@ -703,12 +693,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, 0); + result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); - blk_queue_exit(bdev->bd_queue); + blk_queue_exit(bdev->bd_disk->queue); return result; } @@ -739,7 +729,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, 0); + result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; @@ -752,7 +742,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, clean_page_buffers(page); unlock_page(page); } - blk_queue_exit(bdev->bd_queue); + blk_queue_exit(bdev->bd_disk->queue); return result; } @@ -783,7 +773,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif @@ -799,9 +788,6 @@ static void bdev_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? 
*/ clear_inode(inode); - spin_lock(&bdev_lock); - list_del_init(&bdev->bd_list); - spin_unlock(&bdev_lock); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { @@ -876,8 +862,6 @@ static int bdev_set(struct inode *inode, void *data) return 0; } -static LIST_HEAD(all_bdevs); - struct block_device *bdget(dev_t dev) { struct block_device *bdev; @@ -895,7 +879,6 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; @@ -903,9 +886,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - spin_lock(&bdev_lock); - list_add(&bdev->bd_list, &all_bdevs); - spin_unlock(&bdev_lock); unlock_new_inode(inode); } return bdev; @@ -926,13 +906,14 @@ EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { - struct block_device *bdev; + struct inode *inode; long ret = 0; - spin_lock(&bdev_lock); - list_for_each_entry(bdev, &all_bdevs, bd_list) { - ret += bdev->bd_inode->i_mapping->nrpages; - } - spin_unlock(&bdev_lock); + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + ret += inode->i_mapping->nrpages; + spin_unlock(&blockdev_superblock->s_inode_list_lock); + return ret; } @@ -1034,30 +1015,28 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, } /** - * bd_prepare_to_claim - prepare to claim a block device + * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * - * Prepare to claim @bdev. This function fails if @bdev is already - * claimed by another holder and waits if another claiming is in - * progress. This function doesn't actually claim. On successful - * return, the caller has ownership of bd_claiming and bd_holder[s]. - * - * CONTEXT: - * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab - * it multiple times. + * Claim @bdev. This function fails if @bdev is already claimed by another + * holder and waits if another claiming is in progress. return, the caller + * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. 
*/ -static int bd_prepare_to_claim(struct block_device *bdev, - struct block_device *whole, void *holder) +int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { retry: + spin_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) + if (!bd_may_claim(bdev, whole, holder)) { + spin_unlock(&bdev_lock); return -EBUSY; + } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { @@ -1068,13 +1047,15 @@ retry: spin_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); - spin_lock(&bdev_lock); goto retry; } /* yay, all mine */ + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); return 0; } +EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) { @@ -1097,78 +1078,6 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) return disk; } -/** - * bd_start_claiming - start claiming a block device - * @bdev: block device of interest - * @holder: holder trying to claim @bdev - * - * @bdev is about to be opened exclusively. Check @bdev can be opened - * exclusively and mark that an exclusive open is in progress. Each - * successful call to this function must be matched with a call to - * either bd_finish_claiming() or bd_abort_claiming() (which do not - * fail). - * - * This function is used to gain exclusive access to the block device - * without actually causing other exclusive open attempts to fail. It - * should be used when the open sequence itself requires exclusive - * access but may subsequently fail. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Pointer to the block device containing @bdev on success, ERR_PTR() - * value on failure. - */ -struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) -{ - struct gendisk *disk; - struct block_device *whole; - int partno, err; - - might_sleep(); - - /* - * @bdev might not have been initialized properly yet, look up - * and grab the outer block device the hard way. - */ - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - return ERR_PTR(-ENXIO); - - /* - * Normally, @bdev should equal what's returned from bdget_disk() - * if partno is 0; however, some drivers (floppy) use multiple - * bdev's for the same physical device and @bdev may be one of the - * aliases. Keep @bdev if partno is 0. This means claimer - * tracking is broken for those devices but it has always been that - * way. - */ - if (partno) - whole = bdget_disk(disk, 0); - else - whole = bdgrab(bdev); - - put_disk_and_module(disk); - if (!whole) - return ERR_PTR(-ENOMEM); - - /* prepare to claim, if successful, mark claiming in progress */ - spin_lock(&bdev_lock); - - err = bd_prepare_to_claim(bdev, whole, holder); - if (err == 0) { - whole->bd_claiming = holder; - spin_unlock(&bdev_lock); - return whole; - } else { - spin_unlock(&bdev_lock); - bdput(whole); - return ERR_PTR(err); - } -} -EXPORT_SYMBOL(bd_start_claiming); - static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); @@ -1181,14 +1090,14 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest - * @whole: whole block device (returned from bd_start_claiming()) + * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. 
Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, - void *holder) +static void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder) { spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); @@ -1203,12 +1112,11 @@ void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } -EXPORT_SYMBOL(bd_finish_claiming); /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest - * @whole: whole block device (returned from bd_start_claiming()) + * @whole: whole block device * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be @@ -1368,26 +1276,6 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** - * flush_disk - invalidates all buffer-cache entries on a disk - * - * @bdev: struct block device to be flushed - * @kill_dirty: flag to guide handling of dirty inodes - * - * Invalidates all buffer-cache entries on a disk. It should be called - * when a disk has been changed -- either by a media change or online - * resize. - */ -static void flush_disk(struct block_device *bdev, bool kill_dirty) -{ - if (__invalidate_device(bdev, kill_dirty)) { - printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", - bdev->bd_disk ? bdev->bd_disk->disk_name : ""); - } - bdev->bd_invalidated = 1; -} - -/** * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. @@ -1411,8 +1299,9 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size) - flush_disk(bdev, false); + if (bdev_size > disk_size && __invalidate_device(bdev, false)) + pr_warn("VFS: busy inodes on resized disk %s\n", + disk->disk_name); } bdev->bd_invalidated = 0; } @@ -1471,7 +1360,10 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev, true); + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + bdev->bd_invalidated = 1; if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1547,13 +1439,15 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock_nested(whole->bd_mutex, 1) */ -static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, + int for_part) { + struct block_device *whole = NULL, *claiming = NULL; struct gendisk *disk; int ret; int partno; int perm = 0; - bool first_open = false; + bool first_open = false, unblock_events = true, need_restart; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1569,18 +1463,36 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } restart: - + need_restart = false; ret = -ENXIO; disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; + if (partno) { + whole = bdget_disk(disk, 0); + if (!whole) { + ret = -ENOMEM; + goto out_put_disk; + } + } + + if (!for_part && (mode & FMODE_EXCL)) { + WARN_ON_ONCE(!holder); + if (whole) + claiming = whole; + else + claiming = bdev; + ret = bd_prepare_to_claim(bdev, claiming, holder); + if (ret) + goto out_put_whole; + } 
+ disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { first_open = true; bdev->bd_disk = disk; - bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; bdev->bd_partno = partno; @@ -1593,20 +1505,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); - if (ret == -ERESTARTSYS) { - /* Lost a race with 'disk' being - * deleted, try again. - * See md.c - */ - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - bdev->bd_disk = NULL; - bdev->bd_queue = NULL; - mutex_unlock(&bdev->bd_mutex); - disk_unblock_events(disk); - put_disk_and_module(disk); - goto restart; - } + /* + * If we lost a race with 'disk' being deleted, + * try again. See md.c + */ + if (ret == -ERESTARTSYS) + need_restart = true; } if (!ret) { @@ -1627,18 +1531,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; } else { - struct block_device *whole; - whole = bdget_disk(disk, 0); - ret = -ENOMEM; - if (!whole) - goto out_clear; BUG_ON(for_part); - ret = __blkdev_get(whole, mode, 1); - if (ret) { - bdput(whole); + ret = __blkdev_get(whole, mode, NULL, 1); + if (ret) goto out_clear; - } - bdev->bd_contains = whole; + bdev->bd_contains = bdgrab(whole); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1667,27 +1564,52 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_openers++; if (for_part) bdev->bd_part_count++; + if (claiming) + bd_finish_claiming(bdev, claiming, holder); + + /* + * Block event polling for write claims if requested. Any write holder + * makes the write_holder state stick until all are released. This is + * good enough and tracking individual writeable reference is too + * fragile given the way @mode is used in blkdev_get/put(). 
+ */ + if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + unblock_events = false; + } mutex_unlock(&bdev->bd_mutex); - disk_unblock_events(disk); + + if (unblock_events) + disk_unblock_events(disk); + /* only one opener holds refs to the module and disk */ if (!first_open) put_disk_and_module(disk); + if (whole) + bdput(whole); return 0; out_clear: disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - bdev->bd_queue = NULL; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: + if (claiming) + bd_abort_claiming(bdev, claiming, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); + out_put_whole: + if (whole) + bdput(whole); + out_put_disk: put_disk_and_module(disk); + if (need_restart) + goto restart; out: - return ret; } @@ -1712,50 +1634,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - struct block_device *whole = NULL; int res; - WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); - - if ((mode & FMODE_EXCL) && holder) { - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - - res = __blkdev_get(bdev, mode, 0); - - if (whole) { - struct gendisk *disk = whole->bd_disk; - - /* finish claiming */ - mutex_lock(&bdev->bd_mutex); - if (!res) - bd_finish_claiming(bdev, whole, holder); - else - bd_abort_claiming(bdev, whole, holder); - /* - * Block event polling for write claims if requested. Any - * write holder makes the write_holder state stick until - * all are released. This is good enough and tracking - * individual writeable reference is too fragile given the - * way @mode is used in blkdev_get/put(). 
- */ - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; - disk_block_events(disk); - } - - mutex_unlock(&bdev->bd_mutex); - bdput(whole); - } - + res =__blkdev_get(bdev, mode, holder, 0); if (res) bdput(bdev); - return res; } EXPORT_SYMBOL(blkdev_get); @@ -1851,7 +1734,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c037ef514b64..613920c17ac1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -65,11 +65,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) spin_lock(&fs_info->balance_lock); target = get_restripe_target(fs_info, flags); if (target) { - /* Pick target profile only if it's already available */ - if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { - spin_unlock(&fs_info->balance_lock); - return extended_to_chunk(target); - } + spin_unlock(&fs_info->balance_lock); + return extended_to_chunk(target); } spin_unlock(&fs_info->balance_lock); @@ -118,12 +115,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) void btrfs_get_block_group(struct btrfs_block_group *cache) { - atomic_inc(&cache->count); + refcount_inc(&cache->refs); } void btrfs_put_block_group(struct btrfs_block_group *cache) { - if (atomic_dec_and_test(&cache->count)) { + if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); @@ -1111,7 +1108,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret < 0) goto out; - mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; /* @@ -1143,8 +1139,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, remove_em = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); - mutex_unlock(&fs_info->chunk_mutex); - if (remove_em) { struct extent_map_tree *em_tree; @@ -1532,21 +1526,70 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, + struct btrfs_path *path) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_block_group_item bg; + struct extent_buffer *leaf; + int slot; + u64 flags; + int ret = 0; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + em_tree = &fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, key->objectid, key->offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + key->objectid, key->offset); + return -ENOENT; + } + + if (em->start != key->objectid || em->len != key->offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + key->objectid, key->offset, em->start, em->len); + ret = -EUCLEAN; + goto out_free_em; + } + + read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_stack_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + key->objectid, key->offset, 
flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + ret = -EUCLEAN; + } + +out_free_em: + free_extent_map(em); + return ret; +} + static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { struct btrfs_root *root = fs_info->extent_root; - int ret = 0; + int ret; struct btrfs_key found_key; struct extent_buffer *leaf; - struct btrfs_block_group_item bg; - u64 flags; int slot; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (1) { slot = path->slots[0]; @@ -1563,49 +1606,10 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - em_tree = &root->fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, found_key.objectid, - found_key.offset); - read_unlock(&em_tree->lock); - if (!em) { - btrfs_err(fs_info, - "logical %llu len %llu found bg but no related chunk", - found_key.objectid, found_key.offset); - ret = -ENOENT; - } else if (em->start != found_key.objectid || - em->len != found_key.offset) { - btrfs_err(fs_info, - "block group %llu len %llu mismatch with chunk %llu len %llu", - found_key.objectid, found_key.offset, - em->start, em->len); - ret = -EUCLEAN; - } else { - read_extent_buffer(leaf, &bg, - btrfs_item_ptr_offset(leaf, slot), - sizeof(bg)); - flags = btrfs_stack_block_group_flags(&bg) & - BTRFS_BLOCK_GROUP_TYPE_MASK; - - if (flags != (em->map_lookup->type & - BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", - found_key.objectid, - found_key.offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & - em->map_lookup->type)); - ret = -EUCLEAN; - } else { - ret = 0; - } - } - free_extent_map(em); - goto out; + ret = read_bg_from_eb(fs_info, &found_key, path); + break; } + path->slots[0]++; } out: @@ -1657,19 +1661,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, return -EIO; map = em->map_lookup; - data_stripe_length = em->len; + data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID10) - data_stripe_length = div_u64(data_stripe_length, - map->num_stripes / map->sub_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - data_stripe_length = div_u64(data_stripe_length, map->num_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - data_stripe_length = div_u64(data_stripe_length, - nr_data_stripes(map)); + /* For RAID5/6 adjust to a full IO stripe length */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) io_stripe_size = map->stripe_len * nr_data_stripes(map); - } buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { @@ -1748,25 +1745,12 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return ret; while (nr--) { - u64 start, len; - - if (logical[nr] > cache->start + cache->length) - continue; - - if (logical[nr] + stripe_len <= cache->start) - continue; - - start = logical[nr]; - if (start < cache->start) { - start = cache->start; - len = (logical[nr] + stripe_len) - start; - } else { - len = min_t(u64, stripe_len, - cache->start + cache->length - start); - } + u64 len = min_t(u64, stripe_len, + cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = btrfs_add_excluded_extent(fs_info, start, len); + ret = 
btrfs_add_excluded_extent(fs_info, logical[nr], + len); if (ret) { kfree(logical); return ret; @@ -1818,7 +1802,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; - atomic_set(&cache->count, 1); + refcount_set(&cache->refs, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); @@ -2207,54 +2191,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, return 0; } -static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - num_devices = fs_info->fs_devices->rw_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; - - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; - - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ - } - - return flags; -} - /* * Mark one block group RO, can be called several times for the same block * group. @@ -2300,7 +2236,7 @@ again: * If we are changing raid levels, try to allocate a * corresponding block group with the new raid level. 
*/ - alloc_flags = update_block_group_flags(fs_info, cache->flags); + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); @@ -2327,7 +2263,7 @@ again: ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(fs_info, cache->flags); + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); mutex_lock(&fs_info->chunk_mutex); check_system_chunk(trans, alloc_flags); mutex_unlock(&fs_info->chunk_mutex); @@ -2521,7 +2457,8 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, + num_pages); if (ret) goto out_put; @@ -3392,7 +3329,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(list_empty(&block_group->dirty_list)); ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); - ASSERT(atomic_read(&block_group->count) == 1); + ASSERT(refcount_read(&block_group->refs) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); @@ -3447,7 +3384,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_unlock(&block_group->lock); if (cleanup) { - mutex_lock(&fs_info->chunk_mutex); em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->start, @@ -3455,7 +3391,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) BUG_ON(!em); /* logic error, can't happen */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - mutex_unlock(&fs_info->chunk_mutex); /* once for us and once for the tree */ free_extent_map(em); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index b6ee70a039c7..adfd7583a17b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -114,8 +114,7 @@ struct btrfs_block_group { /* For block groups in the same raid type */ struct list_head list; - /* Usage count */ - atomic_t count; + refcount_t refs; /* * List of struct btrfs_free_clusters for this block group. diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e7d709505cb1..c47b6c6fea9f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -152,6 +152,17 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * The id/generation of the last transaction where this inode was + * either the source or the destination of a clone/dedupe operation. + * Used when logging an inode to know if there are shared extents that + * need special care when logging checksum items, to avoid duplicate + * checksum items in a log (which can lead to a corruption where we end + * up with missing checksum ranges after log replay). + * Protected by the vfs inode lock. + */ + u64 last_reflink_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. 
*/ diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 32e11a23b47f..81a8c87a5afb 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -631,10 +631,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int pass; selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (NULL == selected_super) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!selected_super) return -ENOMEM; - } list_for_each_entry(device, dev_head, dev_list) { int i; @@ -795,7 +793,6 @@ static int btrfsic_process_superblock_dev_mirror( if (NULL == superblock_tmp) { superblock_tmp = btrfsic_block_alloc(); if (NULL == superblock_tmp) { - pr_info("btrfsic: error, kmalloc failed!\n"); ret = -1; goto out; } @@ -921,9 +918,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) struct btrfsic_stack_frame *sf; sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (NULL == sf) - pr_info("btrfsic: alloc memory failed!\n"); - else + if (sf) sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; return sf; } @@ -1313,7 +1308,6 @@ static int btrfsic_create_link_to_next_block( if (NULL == l) { l = btrfsic_block_link_alloc(); if (NULL == l) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1470,7 +1464,6 @@ static int btrfsic_handle_extent_data( mirror_num, &block_was_created); if (NULL == next_block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&next_block_ctx); return -1; } @@ -2013,7 +2006,6 @@ again: block = btrfsic_block_alloc(); if (NULL == block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&block_ctx); goto continue_loop; } @@ -2234,7 +2226,6 @@ static int btrfsic_process_written_superblock( mirror_num, &was_created); if (NULL == next_block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&tmp_next_block_ctx); return -1; } @@ -2542,10 +2533,8 @@ static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( &state->block_link_hashtable); if (NULL == l) { l = btrfsic_block_link_alloc(); - if (NULL == l) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!l) return NULL; - } l->block_ref_to = next_block; l->block_ref_from = from_block; @@ -2589,10 +2578,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( struct btrfsic_dev_state *dev_state; block = btrfsic_block_alloc(); - if (NULL == block) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!block) return NULL; - } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); if (NULL == dev_state) { pr_info("btrfsic: error, lookup dev_state failed!\n"); @@ -2797,10 +2785,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, return -1; } state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) { - pr_info("btrfs check-integrity: allocation failed!\n"); + if (!state) return -ENOMEM; - } if (!btrfsic_is_initialized) { mutex_init(&btrfsic_mutex); @@ -2829,7 +2815,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, ds = btrfsic_dev_state_alloc(); if (NULL == ds) { - pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); return -ENOMEM; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6e648603f85..1ab56a734e70 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -172,18 +172,17 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static int 
check_compressed_csum(struct btrfs_inode *inode, - struct compressed_bio *cb, +static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, u64 disk_start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - int ret; struct page *page; unsigned long i; char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; + struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) @@ -201,15 +200,15 @@ static int check_compressed_csum(struct btrfs_inode *inode, if (memcmp(&csum, cb_sum, csum_size)) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); - ret = -EIO; - goto fail; + if (btrfs_io_bio(bio)->device) + btrfs_dev_stat_inc_and_print( + btrfs_io_bio(bio)->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + return -EIO; } cb_sum += csum_size; - } - ret = 0; -fail: - return ret; + return 0; } /* when we finish reading compressed pages from the disk, we @@ -244,7 +243,6 @@ static void end_compressed_bio_read(struct bio *bio) * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly. */ - ASSERT(btrfs_io_bio(cb->orig_bio)); btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; @@ -256,7 +254,7 @@ static void end_compressed_bio_read(struct bio *bio) goto csum_failed; inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), cb, + ret = check_compressed_csum(BTRFS_I(inode), bio, (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -405,7 +403,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -413,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; unsigned long bytes_left; @@ -421,7 +419,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; blk_status_t ret; - int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -429,7 +427,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; - cb->inode = inode; + cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; @@ -455,7 +453,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, int submit = 0; page = compressed_pages[pg_index]; - page->mapping = inode->i_mapping; + page->mapping = inode->vfs_inode.i_mapping; if (bio->bi_iter.bi_size) submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 284a3ad31350..9f3dbe372631 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -8,6 +8,8 @@ #include <linux/sizes.h> +struct btrfs_inode; + /* * We want to make sure that amount of 
RAM required to uncompress an extent is * reasonable, so we limit the total size in ram of a compressed extent to @@ -88,7 +90,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 82ab6e5a386d..70e49d8d4f6c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1501,6 +1501,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize) return 0; } +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static int comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + /* * compare two keys in a memcmp fashion */ @@ -1513,6 +1529,7 @@ static int comp_keys(const struct btrfs_disk_key *disk, return btrfs_comp_cpu_keys(&k1, k2); } +#endif /* * same as comp_keys only with two btrfs_key's diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d404cce8ae40..9c7e466f27a9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -546,11 +546,6 @@ enum { */ BTRFS_FS_EXCL_OP, /* - * To info transaction_kthread we need an immediate commit so it - * doesn't need to wait for commit_interval - */ - BTRFS_FS_NEED_ASYNC_COMMIT, - /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. * Set and cleared while holding fs_info::balance_mutex. @@ -779,6 +774,7 @@ struct btrfs_fs_info { u32 thread_pool_size; struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; u64 total_pinned; @@ -1011,6 +1007,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, }; /* @@ -1059,8 +1057,10 @@ struct btrfs_root { wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ @@ -1075,7 +1075,6 @@ struct btrfs_root { u64 highest_objectid; - u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1162,6 +1161,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait; /* Number of active swapfiles */ atomic_t nr_swapfiles; @@ -1277,18 +1277,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) BTRFS_MOUNT_##opt) #define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -{ \ +do { \ if (!btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_set_opt(fs_info->mount_opt, opt); \ -} +} while (0) #define btrfs_clear_and_info(fs_info, opt, fmt, args...) 
\ -{ \ +do { \ if (btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_clear_opt(fs_info->mount_opt, opt); \ -} +} while (0) /* * Requests for changes that need to be done during transaction commit. @@ -1895,6 +1895,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. + */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { @@ -1936,6 +1982,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } +#endif + /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, @@ -2232,7 +2280,8 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) } /* struct btrfs_file_extent_item */ -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, @@ -2241,6 +2290,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, @@ -2257,6 +2308,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, @@ -2508,16 +2560,46 @@ int btrfs_inc_extent_ref(struct 
btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. + */ enum btrfs_reserve_flush_enum { /* If we are in the transaction, we can't flush anything.*/ BTRFS_RESERVE_NO_FLUSH, + /* - * Flushing delalloc may cause deadlock somewhere, in this - * case, use FLUSH LIMIT + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk */ BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Commiting transaction + * + * Can be interruped by fatal signal. + */ BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interruped by fatal signal. + */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; @@ -2831,8 +2913,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - u64 file_start, int contig); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, @@ -2875,7 +2957,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -2928,7 +3010,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); @@ -2962,7 +3044,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, u64 start, u64 end, u64 *drop_end, int drop_cache, int replace_extent, @@ -2978,10 +3060,13 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct inode 
*inode, struct page **pages, +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3194,7 +3279,7 @@ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO) { \ + if ((errno) != -EIO && (errno) != -EROFS) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ @@ -3378,7 +3463,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 1245739a3a6e..0e354e9e57d0 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -237,10 +237,10 @@ commit_trans: return 0; } -int btrfs_check_data_free_space(struct inode *inode, +int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; /* align the range */ @@ -248,14 +248,14 @@ int btrfs_check_data_free_space(struct inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + ret = btrfs_alloc_data_chunk_ondemand(inode, len); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); else ret = 0; return ret; @@ -269,16 +269,12 @@ int btrfs_check_data_free_space(struct inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_space_info *data_sinfo; - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); + ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -293,17 +289,17 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. 
*/ -void btrfs_free_reserved_data_space(struct inode *inode, +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); btrfs_qgroup_free_data(inode, reserved, start, len); } @@ -557,7 +553,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { int ret; @@ -565,7 +561,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(inode, len); if (ret < 0) btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; @@ -583,10 +579,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, +void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); + btrfs_delalloc_release_metadata(inode, len, qgroup_free); btrfs_free_reserved_data_space(inode, reserved, start, len); } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 54466fbd7075..28bf5c3ef430 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -6,18 +6,18 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, +int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, +void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1a148058773..9ae25f632157 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1116,6 +1116,7 @@ 
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); @@ -1141,10 +1142,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (!dummy) - root->defrag_trans_start = fs_info->generation; - else - root->defrag_trans_start = 0; root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1395,7 +1392,12 @@ alloc_fail: goto out; } -static int btrfs_init_fs_root(struct btrfs_root *root) +/* + * Initialize subvolume root in-memory structure + * + * @anon_dev: anonymous device to attach to the root, if zero, allocate new + */ +static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; unsigned int nofs_flag; @@ -1428,9 +1430,20 @@ static int btrfs_init_fs_root(struct btrfs_root *root) spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; + /* + * Don't assign anonymous block device to roots that are not exposed to + * userspace, the id pool is limited to 1M + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { + if (!anon_dev) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } else { + root->anon_dev = anon_dev; + } + } mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, @@ -1534,8 +1547,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) } -struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, bool check_ref) +/* + * Get an in-memory reference of a root structure. + * + * For essential trees like root/extent tree, we grab it from fs_info directly. + * For subvolume trees, we check the cached filesystem roots first. If not + * found, then read it from disk and add it to cached fs roots. + * + * Caller should release the root by calling btrfs_put_root() after the usage. + * + * NOTE: Reloc and log trees can't be read by this function as they share the + * same root objectid. + * + * @objectid: root id + * @anon_dev: preallocated anonymous block device number for new roots, + * pass 0 for new allocation. 
+ * @check_ref: whether to check root item references, If true, return -ENOENT + * for orphan roots + */ +static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev, + bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; @@ -1564,6 +1596,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { + /* Shouldn't get preallocated anon_dev for cached roots */ + ASSERT(!anon_dev); if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); @@ -1583,7 +1617,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root); + ret = btrfs_init_fs_root(root, anon_dev); if (ret) goto fail; @@ -1616,25 +1650,31 @@ fail: return ERR_PTR(ret); } -static int btrfs_congested_fn(void *congested_data, int bdi_bits) +/* + * Get in-memory reference of a root structure + * + * @objectid: tree objectid + * @check_ref: if set, verify that the tree exists and the item has at least + * one reference + */ +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref) { - struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; - int ret = 0; - struct btrfs_device *device; - struct backing_dev_info *bdi; + return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); +} - rcu_read_lock(); - list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { - if (!device->bdev) - continue; - bdi = device->bdev->bd_bdi; - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - return ret; +/* + * Get in-memory reference of a root structure, created as new, optionally pass + * the anonymous block device id + * + * @objectid: tree objectid + * @anon_dev: if zero, allocate a new anonymous block device or use the + * parameter value + */ +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev) +{ + return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } /* @@ -1749,7 +1789,6 @@ static int transaction_kthread(void *arg) now = ktime_get_seconds(); if (cur->state < TRANS_STATE_COMMIT_START && - !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); @@ -2001,8 +2040,7 @@ void btrfs_put_root(struct btrfs_root *root) if (root->anon_dev) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); + free_root_extent_buffers(root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG @@ -3053,8 +3091,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sb_buffer; } - sb->s_bdi->congested_fn = btrfs_congested_fn; - sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); @@ -4058,6 +4094,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); + if (btrfs_check_quota_leak(fs_info)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err(fs_info, "qgroup reserved space leaked"); + } + btrfs_free_qgroup_config(fs_info); ASSERT(list_empty(&fs_info->delalloc_roots)); diff --git a/fs/btrfs/disk-io.h 
b/fs/btrfs/disk-io.h index bf43245406c4..00dc39d47ed3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index b6561455b3c4..f39d47a2d01a 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -233,14 +233,11 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, struct extent_state **cached_state); /* This should be reworked in the future and put elsewhere. */ -int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec); +struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start); int set_state_failrec(struct extent_io_tree *tree, u64 start, struct io_failure_record *failrec); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret); int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0bc35f932bf..61ede335f6c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -5466,6 +5473,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } } + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. 
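Illustrative sketch, not part of the patch: with get_state_failrec() (prototype change in extent-io-tree.h above) now returning the record or an ERR_PTR instead of an int plus out-parameter, callers test the pointer directly; the clean_io_failure() hunk further below follows the same pattern. example_lookup_failrec() is a hypothetical name.

static int example_lookup_failrec(struct extent_io_tree *failure_tree, u64 start)
{
	struct io_failure_record *failrec;

	failrec = get_state_failrec(failure_tree, start);
	if (IS_ERR(failrec))
		return PTR_ERR(failrec);	/* -ENOENT when no record exists */

	/* ... inspect failrec->this_mirror, failrec->len, ... */
	return 0;
}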
+ */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60278e52c37a..6def411b2eba 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2018,15 +2018,14 @@ out: return err; } -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); - __process_pages_contig(inode->i_mapping, locked_page, + __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, page_ops, NULL); } @@ -2123,12 +2122,11 @@ out: return ret; } -int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec) +struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) { struct rb_node *node; struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec; spin_lock(&tree->lock); /* @@ -2137,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start, */ node = tree_search(tree, start); if (!node) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } state = rb_entry(node, struct extent_state, rb_node); if (state->start != start) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } - *failrec = state->failrec; + + failrec = state->failrec; out: spin_unlock(&tree->lock); - return ret; + return failrec; } /* @@ -2378,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) + failrec = get_state_failrec(failure_tree, start); + if (IS_ERR(failrec)) return 0; BUG_ON(!failrec->this_mirror); @@ -2451,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) spin_unlock(&failure_tree->lock); } -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret) +static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2463,65 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, int ret; u64 logical; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) { - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - - failrec->start = start; - failrec->len = end - start + 1; - failrec->this_mirror = 0; - failrec->bio_flags = 0; - failrec->in_validation = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return -EIO; - } - - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { - kfree(failrec); - return -EIO; - } - - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - 
em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - - /* set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) - ret = set_state_failrec(failure_tree, start, failrec); - /* set the bits in the inode's tree */ - if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); - if (ret < 0) { - kfree(failrec); - return ret; - } - } else { + failrec = get_state_failrec(failure_tree, start); + if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", failrec->logical, failrec->start, failrec->len, @@ -2531,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ + + return failrec; } - *failrec_ret = failrec; + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return ERR_PTR(-ENOMEM); - return 0; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return ERR_PTR(-EIO); + } + + if (em->start > start || em->start + em->len <= start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + if (!em) { + kfree(failrec); + return ERR_PTR(-EIO); + } + + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, em->compress_type); + } + + btrfs_debug(fs_info, + "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", + logical, start, failrec->len); + + failrec->logical = logical; + free_extent_map(em); + + /* Set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY); + if (ret >= 0) { + ret = set_state_failrec(failure_tree, start, failrec); + /* Set the bits in the inode's tree */ + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + } else if (ret < 0) { + kfree(failrec); + return ERR_PTR(ret); + } + + return failrec; } static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, @@ -2660,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; blk_status_t status; - int ret; btrfs_debug(fs_info, "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); - if (ret) - return errno_to_blk_status(ret); + failrec = btrfs_get_io_failure_record(inode, start, end); + if (IS_ERR(failrec)) + return errno_to_blk_status(PTR_ERR(failrec)); need_validation = btrfs_io_needs_validation(inode, failed_bio); @@ -3420,7 +3416,7 @@ static void update_nr_written(struct writeback_control *wbc, * This returns 0 if all went well (page still locked) * This returns < 0 if there were errors (page still locked) */ -static noinline_for_stack int writepage_delalloc(struct inode *inode, +static noinline_for_stack int writepage_delalloc(struct 
btrfs_inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { @@ -3433,7 +3429,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, page, + found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); if (!found) { @@ -3450,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, * started, so we don't want to return > 0 unless * things are going well. */ - ret = ret < 0 ? ret : -EIO; - goto done; + return ret < 0 ? ret : -EIO; } /* * delalloc_end is already one less than the total length, so @@ -3483,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, return 1; } - ret = 0; - -done: - return ret; + return 0; } /* @@ -3497,7 +3489,7 @@ done: * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct inode *inode, +static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, struct extent_page_data *epd, @@ -3505,7 +3497,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, unsigned long nr_written, int *nr_ret) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3537,7 +3529,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, update_nr_written(wbc, nr_written + 1); end = page_end; - blocksize = inode->i_sb->s_blocksize; + blocksize = inode->vfs_inode.i_sb->s_blocksize; while (cur <= end) { u64 em_end; @@ -3548,8 +3540,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, - end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3586,7 +3577,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, btrfs_set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, + btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -3659,15 +3650,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); if (!epd->extent_locked) { - ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, + &nr_written); if (ret == 1) return 0; if (ret) goto done; } - ret = __extent_writepage_io(inode, page, wbc, epd, - i_size, nr_written, &nr); + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + nr_written, &nr); if (ret == 1) return 0; @@ -4127,7 +4119,7 @@ retry: if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { ret = flush_write_bio(&epd); } else { - ret = -EUCLEAN; + ret = -EROFS; end_write_bio(&epd, ret); } return ret; @@ -4489,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { + struct btrfs_fs_info *fs_info; + u64 cur_gen; + len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); @@ -4502,20 +4497,52 @@ int 
try_release_extent_mapping(struct page *page, gfp_t mask) free_extent_map(em); break; } - if (!test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &btrfs_inode->runtime_flags); - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); - } + if (test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED, 0, NULL)) + goto next; + /* + * If it's not in the list of modified extents, used + * by a fast fsync, we can remove it. If it's being + * logged we can safely remove it since fsync took an + * extra reference on the em. + */ + if (list_empty(&em->list) || + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + goto remove_em; + /* + * If it's in the list of modified extents, remove it + * only if its generation is older then the current one, + * in which case we don't need it for a fast fsync. + * Otherwise don't remove it, we could be racing with an + * ongoing fast fsync that could miss the new extent. + */ + fs_info = btrfs_inode->root->fs_info; + spin_lock(&fs_info->trans_lock); + cur_gen = fs_info->generation; + spin_unlock(&fs_info->trans_lock); + if (em->generation >= cur_gen) + goto next; +remove_em: + /* + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there + * is no need to set the full fsync flag on the inode (it + * hurts the fsync performance for workloads with a data + * size that exceeds or is close to the system's memory). + */ + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); +next: start = extent_map_end(em); write_unlock(&map->lock); /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(tree, page, mask); @@ -4670,7 +4697,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) + u64 start, u64 len) { int ret = 0; u64 off = start; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 87f60a48f750..00a88f2eb5ab 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -204,7 +204,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); + u64 start, u64 len); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -277,7 +277,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 706a3128e192..7d5ec71615b8 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -522,10 +522,10 @@ fail: * means this bio can contains potentially discontigous bio vecs * so the logical offset of each should be calculated separately. 
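Illustrative sketch, not part of the patch: a typical submit-time use of btrfs_csum_one_bio() after the signature change below, mirroring the btrfs_submit_bio_start() hunk later in this section; example_csum_bio() is a hypothetical wrapper, and the comment on contig restates the description above.

static blk_status_t example_csum_bio(struct btrfs_inode *inode, struct bio *bio)
{
	/*
	 * contig == 0: the file offset of each bio_vec is derived from its
	 * page, so discontiguous vecs are checksummed per vec and the
	 * file_start argument (0 here) is not used.
	 */
	return btrfs_csum_one_bio(inode, bio, 0, 0);
}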
*/ -blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 file_start, int contig) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b0d2c976587e..bb824c7cb7c7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -500,18 +500,18 @@ next: * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. */ -int btrfs_dirty_pages(struct inode *inode, struct page **pages, +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int err = 0; int i; u64 num_bytes; u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; - loff_t isize = i_size_read(inode); + loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); @@ -524,13 +524,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, + clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); - if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (!btrfs_is_free_space_inode(inode)) { if (start_pos >= isize && - !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case * so just set the delalloc new bit for the range @@ -538,8 +538,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, */ extra_bits |= EXTENT_DELALLOC_NEW; } else { - err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), - start_pos, + err = btrfs_find_new_delalloc_bytes(inode, start_pos, num_bytes, cached); if (err) return err; @@ -564,7 +563,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * at this time. */ if (end_pos > isize) - i_size_write(inode, end_pos); + i_size_write(&inode->vfs_inode, end_pos); return 0; } @@ -731,7 +730,7 @@ next: * is deleted from the tree. 
*/ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, u64 start, u64 end, u64 *drop_end, int drop_cache, int replace_extent, @@ -744,7 +743,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct inode *vfs_inode = &inode->vfs_inode; + u64 ino = btrfs_ino(inode); u64 search_start = start; u64 disk_bytenr = 0; u64 num_bytes = 0; @@ -762,9 +762,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; if (drop_cache) - btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0); + btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) + if (start >= inode->disk_i_size && !replace_extent) modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || @@ -935,7 +935,7 @@ next_slot: extent_end - end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(inode, end - key.offset); + inode_sub_bytes(vfs_inode, end - key.offset); break; } @@ -955,7 +955,7 @@ next_slot: start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(inode, extent_end - start); + inode_sub_bytes(vfs_inode, extent_end - start); if (end == extent_end) break; @@ -979,7 +979,7 @@ delete_extent_item: if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(inode, + inode_sub_bytes(vfs_inode, extent_end - key.offset); extent_end = ALIGN(extent_end, fs_info->sectorsize); @@ -993,7 +993,7 @@ delete_extent_item: key.offset - extent_offset); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(inode, + inode_sub_bytes(vfs_inode, extent_end - key.offset); } @@ -1082,8 +1082,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache, 0, 0, NULL); + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, + end, NULL, drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1532,8 +1532,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1541,6 +1541,9 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; + if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return 0; + if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; @@ -1583,6 +1586,42 @@ out_unlock: return ret; } +static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) +{ + return check_can_nocow(inode, pos, write_bytes, true); +} + +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset + * @write_bytes: The length to write, will be updated to the nocow writeable + * range + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. 
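Illustrative sketch, not part of the patch: the calling pattern described by the comment above for btrfs_check_nocow_lock()/btrfs_check_nocow_unlock(), matching the buffered-write hunk further below; example_try_nocow() is a hypothetical helper with simplified error handling.

static bool example_try_nocow(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes)
{
	int ret = btrfs_check_nocow_lock(inode, pos, write_bytes);

	if (ret <= 0)
		return false;	/* 0: range must be COWed, <0: error, e.g. -EAGAIN */

	/* *write_bytes is now the NOCOW-writable length; do the write ... */

	btrfs_check_nocow_unlock(inode);
	return true;
}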
+ * + * Return: + * >0 and update @write_bytes if we can do nocow write + * 0 if we can't do nocow write + * -EAGAIN if we can't get the needed lock or there are ordered extents + * for * (nowait == true) case + * <0 if other error happened + * + * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). + */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) +{ + return check_can_nocow(inode, pos, write_bytes, false); +} + +void btrfs_check_nocow_unlock(struct btrfs_inode *inode) +{ + btrfs_drew_write_unlock(&inode->root->snapshot_lock); +} + static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { @@ -1590,7 +1629,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1643,13 +1681,12 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &data_reserved, pos, write_bytes); if (ret < 0) { - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - check_can_nocow(BTRFS_I(inode), pos, - &write_bytes, false) > 0) { + if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1674,11 +1711,11 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, pos, write_bytes); else - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); break; } @@ -1748,7 +1785,7 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, __pos, release_bytes, true); } @@ -1758,8 +1795,9 @@ again: fs_info->sectorsize); if (copied > 0) - ret = btrfs_dirty_pages(inode, pages, dirty_pages, - pos, copied, &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state); /* * If we have not locked the extent range, because the range's @@ -1782,7 +1820,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1800,8 +1838,6 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1) - btrfs_btree_balance_dirty(fs_info); pos += copied; num_written += copied; @@ -1811,11 +1847,12 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, round_down(pos, fs_info->sectorsize), release_bytes, true); } @@ -1926,10 +1963,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, 
* We will allocate space in case nodatacow is not set, * so bail */ - if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, - true) <= 0) { + if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) + <= 0) { inode_unlock(inode); return -EAGAIN; } @@ -2598,7 +2633,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, cur_offset = start; while (cur_offset < end) { - ret = __btrfs_drop_extents(trans, root, inode, path, + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, cur_offset, end + 1, &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) { @@ -3176,14 +3211,14 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, - alloc_start, bytes_to_reserve); - if (ret) - goto out; ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); if (ret) goto out; + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), @@ -3199,7 +3234,7 @@ reserve_space: ret = btrfs_fallocate_update_isize(inode, offset + len, mode); out: if (ret && space_reserved) - btrfs_free_reserved_data_space(inode, data_reserved, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, alloc_start, bytes_to_reserve); extent_changeset_free(data_reserved); @@ -3350,8 +3385,9 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, - cur_offset, last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), + &data_reserved, cur_offset, + last_byte - cur_offset); if (ret < 0) { cur_offset = last_byte; free_extent_map(em); @@ -3363,8 +3399,9 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, data_reserved, - cur_offset, last_byte - cur_offset); + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -3381,7 +3418,7 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, range->start, range->len); list_del(&range->list); @@ -3402,7 +3439,7 @@ out: inode_unlock(inode); /* Let go of our reservation. 
*/ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(inode, data_reserved, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; @@ -3500,7 +3537,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return generic_file_open(inode, filp); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 55955bd424d7..6d961e11639e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1334,8 +1334,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0, - i_size_read(inode), &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, + io_ctl->num_pages, 0, i_size_read(inode), + &cached_state); if (ret) goto out_nospc; @@ -2703,8 +2704,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) * pointed to by the cluster, someone else raced in and freed the * cluster already. In that case, we just return without changing anything */ -static int -__btrfs_return_cluster_to_free_space( +static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { @@ -2756,7 +2756,6 @@ __btrfs_return_cluster_to_free_space( out: spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); - return 0; } static void __btrfs_remove_free_space_cache_locked( @@ -2907,12 +2906,11 @@ out: * Otherwise, it'll get a reference on the block group pointed to by the * cluster and remove the cluster from it. 
*/ -int btrfs_return_cluster_to_free_space( +void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; - int ret; /* first, get a safe pointer to the block group */ spin_lock(&cluster->lock); @@ -2920,28 +2918,27 @@ int btrfs_return_cluster_to_free_space( block_group = cluster->block_group; if (!block_group) { spin_unlock(&cluster->lock); - return 0; + return; } } else if (cluster->block_group != block_group) { /* someone else has already freed it don't redo their work */ spin_unlock(&cluster->lock); - return 0; + return; } - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; /* now return any extents the cluster had on it */ spin_lock(&ctl->tree_lock); - ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); /* finally drop our ref */ btrfs_put_block_group(block_group); - return ret; } static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, @@ -3358,7 +3355,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, list_del_init(&entry->list); if (!ret) { - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 2e0a8077aa74..e3d5e0ad8f8e 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -136,7 +136,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size); -int btrfs_return_cluster_to_free_space( +void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 6009e0e939b5..76d2e43817ea 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,7 +495,8 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0, + prealloc); if (ret) goto out_put; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6862cd7e21a9..611b3412fbfd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -80,17 +80,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 block_start, +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int 
compress_type, int type); -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate); @@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct inode *inode, * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ -static inline void btrfs_cleanup_ordered_extents(struct inode *inode, +static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { @@ -116,7 +116,7 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, struct page *page; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; @@ -274,15 +274,15 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct inode *inode, u64 start, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; u64 aligned_end = ALIGN(end, fs_info->sectorsize); @@ -314,7 +314,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -323,9 +323,9 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, - start, aligned_end, NULL, - 1, 1, extent_item_size, &extent_inserted); + ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, + NULL, 1, 1, extent_item_size, + &extent_inserted); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -334,7 +334,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, path, extent_inserted, - root, inode, start, + root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -345,8 +345,8 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -412,10 +412,10 @@ static noinline int add_async_extent(struct async_chunk *cow, /* * Check if the inode has flags compatible with compression */ -static inline bool inode_can_compress(struct inode *inode) +static inline bool inode_can_compress(struct btrfs_inode *inode) { - if (BTRFS_I(inode)->flags & 
BTRFS_INODE_NODATACOW || - BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } @@ -424,29 +424,30 @@ static inline bool inode_can_compress(struct inode *inode) * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ -static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) +static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(BTRFS_I(inode))); + btrfs_ino(inode)); return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ - if (BTRFS_I(inode)->defrag_compress) + if (inode->defrag_compress) return 1; /* bad compression ratios */ - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || - BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || - BTRFS_I(inode)->prop_compress) - return btrfs_compress_heuristic(inode, start, end); + inode->flags & BTRFS_INODE_COMPRESS || + inode->prop_compress) + return btrfs_compress_heuristic(&inode->vfs_inode, start, end); return 0; } @@ -552,7 +553,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { + if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -616,11 +617,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(BTRFS_I(inode), start, end, + 0, BTRFS_COMPRESS_NONE, + NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(inode, start, end, + ret = cow_file_range_inline(BTRFS_I(inode), start, end, total_compressed, compress_type, pages); } @@ -642,7 +644,8 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + NULL, clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | @@ -762,14 +765,14 @@ static void free_async_extent_pages(struct async_extent *async_extent) */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = inode->root; + struct extent_io_tree *io_tree = &inode->io_tree; int ret = 0; again: @@ -802,7 +805,7 @@ retry: * all those pages down to the drive. 
*/ if (!page_started && !ret) - extent_write_locked_range(inode, + extent_write_locked_range(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, @@ -832,7 +835,7 @@ retry: * will not submit these pages down to lower * layers. */ - extent_range_redirty_for_io(inode, + extent_range_redirty_for_io(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1); @@ -867,8 +870,7 @@ retry: BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(BTRFS_I(inode), - async_extent->start, + btrfs_drop_extent_cache(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; @@ -884,8 +886,7 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - if (btrfs_submit_compressed_write(inode, - async_extent->start, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, @@ -896,12 +897,11 @@ retry: const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; - p->mapping = inode->i_mapping; + p->mapping = inode->vfs_inode.i_mapping; btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, - NULL, 0, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); @@ -929,10 +929,10 @@ out_free: goto again; } -static u64 get_extent_allocation_hint(struct inode *inode, u64 start, +static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; @@ -974,13 +974,13 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. 
*/ -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -994,7 +994,7 @@ static noinline int cow_file_range(struct inode *inode, bool extent_reserved = false; int ret = 0; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; @@ -1004,7 +1004,7 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); - inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); + inode_should_defrag(inode, start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ @@ -1033,8 +1033,7 @@ static noinline int cow_file_range(struct inode *inode, } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + num_bytes - 1, 0); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1098,7 +1097,7 @@ static noinline int cow_file_range(struct inode *inode, * skip current ordered extent. */ if (ret) - btrfs_drop_extent_cache(BTRFS_I(inode), start, + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } @@ -1114,8 +1113,7 @@ static noinline int cow_file_range(struct inode *inode, page_ops = unlock ? 
PAGE_UNLOCK : 0; page_ops |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, start, - start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1139,7 +1137,7 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0); + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1236,13 +1234,13 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_chunk->pending); } -static int cow_file_range_async(struct inode *inode, +static int cow_file_range_async(struct btrfs_inode *inode, struct writeback_control *wbc, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; @@ -1254,9 +1252,9 @@ static int cow_file_range_async(struct inode *inode, unsigned nofs_flag; const unsigned int write_flags = wbc_to_write_flags(wbc); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { num_chunks = 1; should_compress = false; @@ -1294,9 +1292,9 @@ static int cow_file_range_async(struct inode *inode, * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ - ihold(inode); + ihold(&inode->vfs_inode); async_chunk[i].pending = &ctx->num_chunks; - async_chunk[i].inode = inode; + async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; @@ -1373,15 +1371,15 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, return 1; } -static int fallback_to_cow(struct inode *inode, struct page *locked_page, +static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, unsigned long *nr_written) { - const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); - const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid == + const bool is_space_ino = btrfs_is_free_space_inode(inode); + const bool is_reloc_ino = (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); const u64 range_bytes = end + 1 - start; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; @@ -1421,7 +1419,7 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, EXTENT_NORESERVE, 0); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) @@ -1447,21 +1445,21 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static noinline int run_delalloc_nocow(struct inode *inode, +static 
noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, int force, unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); - u64 ino = btrfs_ino(BTRFS_I(inode)); + const bool freespace_inode = btrfs_is_free_space_inode(inode); + u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; @@ -1687,8 +1685,8 @@ out_check: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, cow_start, - found_key.offset - 1, + ret = fallback_to_cow(inode, locked_page, + cow_start, found_key.offset - 1, page_started, nr_written); if (ret) goto error; @@ -1716,8 +1714,7 @@ out_check: num_bytes, BTRFS_ORDERED_PREALLOC); if (ret) { - btrfs_drop_extent_cache(BTRFS_I(inode), - cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + num_bytes - 1, 0); goto error; @@ -1793,11 +1790,11 @@ error: return ret; } -static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + !(inode->flags & BTRFS_INODE_PREALLOC)) return 0; /* @@ -1805,9 +1802,8 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) * if is not zero, it means the file is defragging. * Force cow if given extent needs to be defragged. */ - if (BTRFS_I(inode)->defrag_bytes && - test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_DEFRAG, 0, NULL)) + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) return 1; return 0; @@ -1817,26 +1813,25 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. 
*/ -int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { int ret; int force_cow = need_force_cow(inode, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { + if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { + } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1); } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } @@ -2085,9 +2080,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota( - &inode->vfs_inode, - state->start, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2163,7 +2156,7 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, struct inode *inode = private_data; blk_status_t ret = 0; - ret = btrfs_csum_one_bio(inode, bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -2228,7 +2221,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, 0, inode, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); if (ret) goto out; } @@ -2265,13 +2258,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); - return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - extra_bits, cached_state); + return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2288,7 +2281,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page; - struct inode *inode; + struct btrfs_inode *inode; u64 page_start; u64 page_end; int ret = 0; @@ -2296,7 +2289,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = fixup->inode; + inode = BTRFS_I(fixup->inode); page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -2333,8 +2326,7 @@ again: * when the page was already properly dealt with. 
*/ if (!ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); + btrfs_delalloc_release_extents(inode, PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); @@ -2350,20 +2342,18 @@ again: if (ret) goto out_page; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state); + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) goto out_reserved; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2383,11 +2373,11 @@ again: BUG_ON(!PageDirty(page)); free_delalloc_space = false; out_reserved: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { @@ -2410,7 +2400,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(&inode->vfs_inode); } /* @@ -2466,18 +2456,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_pos, - u64 disk_bytenr, u64 disk_num_bytes, - u64 num_bytes, u64 ram_bytes, - u8 compression, u8 encryption, - u16 other_encoding, int extent_type) + struct btrfs_inode *inode, u64 file_pos, + struct btrfs_file_extent_item *stack_fi, + u64 qgroup_reserved) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *fi; + struct btrfs_root *root = inode->root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; - u64 qg_released; + u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); + u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); + u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); int extent_inserted = 0; int ret; @@ -2496,60 +2486,42 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, */ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, file_pos + num_bytes, NULL, 0, - 1, sizeof(*fi), &extent_inserted); + 1, sizeof(*stack_fi), &extent_inserted); if (ret) goto out; if (!extent_inserted) { - ins.objectid = btrfs_ino(BTRFS_I(inode)); + ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*fi)); + sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, fi, 0); - btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); - btrfs_set_file_extent_compression(leaf, fi, compression); - btrfs_set_file_extent_encryption(leaf, fi, encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); + write_extent_buffer(leaf, stack_fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(inode, num_bytes); + inode_add_bytes(&inode->vfs_inode, num_bytes); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos, - ram_bytes); + ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; - /* - * Release the reserved range from inode dirty range map, as it is - * already moved into delayed_ref_head - */ - ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); - if (ret < 0) - goto out; - qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root, - btrfs_ino(BTRFS_I(inode)), - file_pos, qg_released, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), + file_pos, qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2571,7 +2543,33 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); } -/* as ordered data IO finishes, this gets called so we can finish +static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_ordered_extent *oe) +{ + struct btrfs_file_extent_item stack_fi; + u64 logical_len; + + memset(&stack_fi, 0, sizeof(stack_fi)); + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, + oe->disk_num_bytes); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) + logical_len = oe->truncated_len; + else + logical_len = oe->num_bytes; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); + /* Encryption and other encoding is reserved and all 0 */ + + return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, + &stack_fi, oe->qgroup_rsv); +} + +/* + * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ @@ -2622,13 +2620,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - /* - * For mwrite(mmap + memset to write) case, we still reserve - * space for NOCOW range. 
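The hunks above replace the per-field btrfs_set_file_extent_*() calls on the mapped leaf with a btrfs_file_extent_item that is filled on the stack and then copied into the leaf with a single write_extent_buffer(). A minimal userspace sketch of that pattern follows; the struct layout, the leaf buffer and the item offset are invented for illustration and are not the on-disk format.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Stand-in for the on-disk item layout; fields and sizes are illustrative. */
struct file_extent_item {
	uint64_t generation;
	uint64_t disk_bytenr;
	uint64_t disk_num_bytes;
	uint64_t num_bytes;
	uint64_t ram_bytes;
	uint8_t  compression;
};

/* Stand-in for the mapped leaf buffer the item is written into. */
static unsigned char leaf[4096];

int main(void)
{
	struct file_extent_item stack_fi;
	size_t item_off = 128;	/* where the reserved slot starts, illustrative */

	/* Fill the item on the stack first... */
	memset(&stack_fi, 0, sizeof(stack_fi));
	stack_fi.generation     = 42;
	stack_fi.disk_bytenr    = 1 << 20;
	stack_fi.disk_num_bytes = 4096;
	stack_fi.num_bytes      = 4096;
	stack_fi.ram_bytes      = 4096;

	/* ...then publish it with one bulk copy, the analogue of
	 * write_extent_buffer(leaf, &stack_fi, item_ptr_offset, sizeof(stack_fi)).
	 */
	memcpy(leaf + item_off, &stack_fi, sizeof(stack_fi));

	printf("wrote %zu bytes at offset %zu\n", sizeof(stack_fi), item_off);
	return 0;
}

Building the item up front is what lets callers such as insert_ordered_extent_file_extent() describe the extent before the leaf slot even exists.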
- * As NOCOW won't cause a new delayed ref, just free the space - */ - btrfs_qgroup_free_data(inode, NULL, start, - ordered_extent->num_bytes); btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -2665,20 +2656,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, start, - ordered_extent->num_bytes); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, start, - ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, - logical_len, logical_len, - compress_type, 0, 0, - BTRFS_FILE_EXTENT_REG); + ret = insert_ordered_extent_file_extent(trans, inode, + ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -2830,6 +2815,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, io_bio->mirror_num); + if (io_bio->device) + btrfs_dev_stat_inc_and_print(io_bio->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -3348,6 +3336,14 @@ cache_index: */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Same logic as for last_unlink_trans. We don't persist the generation + * of the last transaction where this inode was used for a reflink + * operation, so after eviction and reloading the inode we must be + * pessimistic and assume the last transaction that modified the inode. 
+ */ + BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -3496,7 +3492,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, fill_inode_item(trans, leaf, inode_item, inode); btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); ret = 0; failed: btrfs_free_path(path); @@ -3526,7 +3522,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); return ret; } @@ -4041,6 +4037,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) } } + free_anon_bdev(dest->anon_dev); + dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -4511,11 +4509,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -4527,15 +4527,28 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, + block_start, blocksize); + if (ret < 0) { + if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, + &write_bytes) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, block_start, blocksize); goto out; - + } again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; @@ -4560,7 +4573,7 @@ again: lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(inode, block_start); + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -4575,7 +4588,7 @@ again: EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0, &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -4600,14 +4613,26 @@ again: set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); + out_unlock: - if (ret) - 
btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: + if (only_release_metadata) + btrfs_check_nocow_unlock(BTRFS_I(inode)); extent_changeset_free(data_reserved); return ret; } @@ -4965,7 +4990,8 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, + end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -6040,7 +6066,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_tree_add(inode); trace_btrfs_inode_new(inode); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); @@ -6849,7 +6875,7 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct inode *inode, +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, const u64 orig_start, @@ -6863,21 +6889,19 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, int ret; if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, - block_start, block_len, orig_block_len, - ram_bytes, + em = create_io_em(inode, start, len, orig_start, block_start, + block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ type); if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, - len, block_len, type); + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, + block_len, type); if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + len - 1, 0); + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } em = ERR_PTR(ret); } @@ -6886,11 +6910,11 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, return em; } -static struct extent_map *btrfs_new_extent_direct(struct inode *inode, +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, u64 start, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; @@ -6907,15 +6931,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); return em; } /* - * returns 1 when the nocow is safe, < 1 on error, 0 if the - * block must be cow'd + * Check if we can do nocow write into the range [@offset, @offset + @len) + * + * @offset: File offset + * @len: The length to write, will be updated to the nocow writeable + * range + * @orig_start: (optional) Return the original file offset of the file extent + * @orig_len: (optional) Return the original on-disk length of 
the file extent + * @ram_bytes: (optional) Return the ram_bytes of the file extent + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks for (nowait == false) case. + * + * Return: + * >0 and update @len if we can do nocow write + * 0 if we can't do nocow write + * <0 if error happened + * + * NOTE: This only checks the file extents, caller is responsible to wait for + * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, @@ -7142,8 +7183,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 block_start, +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type) @@ -7157,7 +7198,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &BTRFS_I(inode)->extent_tree; + em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7179,8 +7220,8 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } do { - btrfs_drop_extent_cache(BTRFS_I(inode), em->start, - em->start + em->len - 1, 0); + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); @@ -7259,7 +7300,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; - em2 = btrfs_create_dio_extent(inode, start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); @@ -7278,8 +7319,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * use the existing or preallocated extent, so does not * need to adjust btrfs_space_info's bytes_may_use. 
*/ - btrfs_free_reserved_data_space_noquota(inode, start, - len); + btrfs_free_reserved_data_space_noquota(fs_info, len); goto skip_cow; } } @@ -7287,7 +7327,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, /* this will cow the extent */ len = bh_result->b_size; free_extent_map(em); - *map = em = btrfs_new_extent_direct(inode, start, len); + *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -7438,7 +7478,8 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { - __endio_write_update_ordered(dip->inode, dip->logical_offset, + __endio_write_update_ordered(BTRFS_I(dip->inode), + dip->logical_offset, dip->bytes, !dip->dio_bio->bi_status); } else { @@ -7524,18 +7565,18 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, return err; } -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered = NULL; struct btrfs_workqueue *wq; u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; @@ -7543,9 +7584,9 @@ static void __endio_write_update_ordered(struct inode *inode, while (ordered_offset < offset + bytes) { last_offset = ordered_offset; if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { + &ordered_offset, + ordered_bytes, + uptodate)) { btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered->work); @@ -7572,7 +7613,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, { struct inode *inode = private_data; blk_status_t ret; - ret = btrfs_csum_one_bio(inode, bio, offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -7633,7 +7674,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, * If we aren't doing async submit, calculate the csum of the * bio now. 
*/ - ret = btrfs_csum_one_bio(inode, bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); if (ret) goto err; } else { @@ -7883,7 +7924,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, offset, count); if (ret) goto out; @@ -7915,8 +7956,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, offset, dio_data.reserve, + true); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -7925,13 +7967,13 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, + __endio_write_update_ordered(BTRFS_I(inode), dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, offset, count - (size_t)ret, true); btrfs_delalloc_release_extents(BTRFS_I(inode), count); } @@ -7946,7 +7988,7 @@ out: } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) + u64 start, u64 len) { int ret; @@ -8133,7 +8175,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | @@ -8197,8 +8239,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. 
*/ - ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - reserved_space); + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); if (!ret2) { ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; @@ -8245,9 +8287,9 @@ again: fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE - reserved_space, - true); + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); } } @@ -8262,7 +8304,7 @@ again: EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, @@ -8302,7 +8344,7 @@ out_unlock: unlock_page(page); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(inode, data_reserved, page_start, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); @@ -8516,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -8602,7 +8645,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(inode); + btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); @@ -9584,6 +9627,31 @@ out_unlock: return err; } +static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_key *ins, + u64 file_offset) +{ + struct btrfs_file_extent_item stack_fi; + u64 start = ins->objectid; + u64 len = ins->offset; + int ret; + + memset(&stack_fi, 0, sizeof(stack_fi)); + + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); + /* Encryption and other encoding is reserved and all 0 */ + + ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); + if (ret < 0) + return ret; + return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, + &stack_fi, ret); +} static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9642,11 +9710,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_reserved_file_extent(trans, inode, - cur_offset, ins.objectid, - ins.offset, ins.offset, - ins.offset, 0, 0, 0, - BTRFS_FILE_EXTENT_PREALLOC); + ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); if (ret) { btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); @@ -9719,7 +9783,7 @@ next: btrfs_end_transaction(trans); } if (clear_offset < end) - 
btrfs_free_reserved_data_space(inode, NULL, clear_offset, + btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8f7c5f00894..bd3511c5ca81 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } -/* Check if @flags are a supported and valid set of FS_*_FL flags */ -static int check_fsflags(unsigned int flags) +/* + * Check if @flags are a supported and valid set of FS_*_FL flags and that + * the old and new flags are not conflicting + */ +static int check_fsflags(unsigned int old_flags, unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ @@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags) FS_NOCOW_FL)) return -EOPNOTSUPP; + /* COMPR and NOCOMP on new/old are valid */ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; + if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) + return -EINVAL; + + /* NOCOW and compression options are mutually exclusive */ + if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + return 0; } @@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags = binode->flags; + u32 binode_flags; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (copy_from_user(&fsflags, arg, sizeof(fsflags))) return -EFAULT; - ret = check_fsflags(fsflags); - if (ret) - return ret; - ret = mnt_want_write_file(file); if (ret) return ret; inode_lock(inode); - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); if (ret) goto out_unlock; + ret = check_fsflags(old_fsflags, fsflags); + if (ret) + goto out_unlock; + + binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; else @@ -566,6 +580,7 @@ static noinline int create_subvol(struct inode *dir, struct inode *inode; int ret; int err; + dev_t anon_dev = 0; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; @@ -578,6 +593,10 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail_free; + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto fail_free; + /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. 
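check_fsflags() now also receives the old flags, so NOCOW and the compression flags cannot be combined across an update rather than only within the new value. A compact userspace model of the four rejections is sketched below; the flag bits are stand-ins, the kernel uses the FS_*_FL values.

#include <stdio.h>
#include <errno.h>

/* Stand-in flag bits; the kernel uses the FS_*_FL values from linux/fs.h. */
#define FL_COMPR	0x1
#define FL_NOCOMP	0x2
#define FL_NOCOW	0x4

static int check_fsflags_model(unsigned int old_flags, unsigned int flags)
{
	/* compression request conflicting with itself */
	if ((flags & FL_NOCOMP) && (flags & FL_COMPR))
		return -EINVAL;
	/* new value asks for both NOCOW and compression */
	if ((flags & FL_COMPR) && (flags & FL_NOCOW))
		return -EINVAL;
	/* NOCOW already set, now asking for a compression change */
	if ((old_flags & FL_NOCOW) && (flags & (FL_COMPR | FL_NOCOMP)))
		return -EINVAL;
	/* compression already set, now asking for NOCOW */
	if ((flags & FL_NOCOW) && (old_flags & (FL_COMPR | FL_NOCOMP)))
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("%d\n", check_fsflags_model(FL_NOCOW, FL_COMPR));	/* -EINVAL */
	printf("%d\n", check_fsflags_model(0, FL_NOCOW));		/* 0 */
	return 0;
}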
@@ -660,12 +679,15 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_get_fs_root(fs_info, objectid, true); + new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { + free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); goto fail; } + /* Freeing will be done in btrfs_put_root() of new_root */ + anon_dev = 0; btrfs_record_root_in_trans(trans, new_root); @@ -735,6 +757,8 @@ fail: return ret; fail_free: + if (anon_dev) + free_anon_bdev(anon_dev); kfree(root_item); return ret; } @@ -762,6 +786,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!pending_snapshot) return -ENOMEM; + ret = get_anon_bdev(&pending_snapshot->anon_dev); + if (ret < 0) + goto free_pending; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); @@ -823,10 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; + pending_snapshot->anon_dev = 0; fail: + /* Prevent double freeing of anon_dev */ + if (ret && pending_snapshot->snap) + pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); free_pending: + if (pending_snapshot->anon_dev) + free_anon_bdev(pending_snapshot->anon_dev); kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); @@ -1243,7 +1276,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1265,7 +1298,7 @@ again: while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), page_start); unlock_extent_cached(tree, page_start, page_end, &cached_state); @@ -1333,7 +1366,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1361,7 +1394,7 @@ out: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); @@ -3198,11 +3231,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; int ret = 0; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; @@ -3218,6 +3255,23 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = 
fs_info->sectorsize; + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + + if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { + fi_args->generation = fs_info->generation; + fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; + } + + if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { + memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, + sizeof(fi_args->metadata_uuid)); + fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e13b3d28c063..ebac13389e7e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -15,6 +15,7 @@ #include "disk-io.h" #include "compression.h" #include "delalloc-space.h" +#include "qgroup.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -152,23 +153,39 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* allocate and add a new ordered_extent into the per-inode tree. +/* + * Allocate and add a new ordered_extent into the per-inode tree. * * The tree is given a single reference on the ordered extent that was * inserted. */ -static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int dio, int compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry; + int ret; - tree = &BTRFS_I(inode)->ordered_tree; + if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) + return ret; + ret = 0; + } else { + /* + * The ordered extent has reserved qgroup space, release now + * and pass the reserved number for qgroup_record to free. 
+ */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) + return ret; + } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) return -ENOMEM; @@ -178,9 +195,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->num_bytes = num_bytes; entry->disk_num_bytes = disk_num_bytes; entry->bytes_left = num_bytes; - entry->inode = igrab(inode); + entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; + entry->qgroup_rsv = ret; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -197,10 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - INIT_LIST_HEAD(&entry->log_list); - INIT_LIST_HEAD(&entry->trans_list); - trace_btrfs_ordered_extent_add(inode, entry); + trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -228,14 +244,14 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, * that work has been done at higher layers, so this is truly the * smallest the extent is going to get. */ - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); return 0; } -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { @@ -244,7 +260,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, BTRFS_COMPRESS_NONE); } -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { @@ -253,7 +269,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, BTRFS_COMPRESS_NONE); } -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int compress_type) @@ -291,12 +307,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, * file_offset is updated to one byte past the range that is recorded as * complete. This allows you to walk forward in the file. 
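The hunks above move the data qgroup reservation hand-over into __btrfs_add_ordered_extent(): for NOCOW and PREALLOC writes the reservation is freed immediately, otherwise it is released and the byte count is carried in the new qgroup_rsv field until the ordered extent finishes. A small userspace model of that branching is sketched below; the struct and the qgroup_free()/qgroup_release() helpers are stand-ins, not the real API.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Minimal model of how the ordered extent now carries the qgroup rsv. */
struct ordered_extent_model {
	uint64_t num_bytes;
	uint64_t qgroup_rsv;	/* released bytes carried to the finish path */
};

static long qgroup_free(uint64_t bytes)    { (void)bytes; return 0; }	   /* drop it now */
static long qgroup_release(uint64_t bytes) { return (long)bytes; }	   /* hand it over */

static int add_ordered_model(struct ordered_extent_model *oe, bool nocow_or_prealloc)
{
	long ret;

	if (nocow_or_prealloc)
		ret = qgroup_free(oe->num_bytes);	/* no new extent: free rsv */
	else
		ret = qgroup_release(oe->num_bytes);	/* rsv follows the ordered extent */
	if (ret < 0)
		return (int)ret;

	oe->qgroup_rsv = (uint64_t)ret;
	return 0;
}

int main(void)
{
	struct ordered_extent_model oe = { .num_bytes = 4096 };

	add_ordered_model(&oe, false);
	printf("carried qgroup rsv: %llu\n", (unsigned long long)oe.qgroup_rsv);
	return 0;
}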
*/ -int btrfs_dec_test_first_ordered_pending(struct inode *inode, +int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_ordered_inode_tree *tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; @@ -305,7 +321,6 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, u64 dec_start; u64 to_dec; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { @@ -429,8 +444,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { - ASSERT(list_empty(&entry->log_list)); - ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) @@ -698,14 +711,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * find an ordered extent corresponding to file_offset. return NULL if * nothing is found, otherwise take a reference on the extent and return it */ -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -803,7 +816,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(inode, offset); + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); if (!ordered) return 0; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c01c9698250b..d61ea9c880a3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -92,6 +92,9 @@ struct btrfs_ordered_extent { /* compression algorithm */ int compress_type; + /* Qgroup reserved space */ + int qgroup_rsv; + /* reference count */ refcount_t refs; @@ -101,12 +104,6 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; - /* If we need to wait on this to be done */ - struct list_head log_list; - - /* If the transaction needs to wait on this ordered extent */ - struct list_head trans_list; - /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -150,23 +147,23 @@ void btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct inode *inode, +int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate); -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 
file_offset, +int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5bd4089ad0e1..c0f350c3a0cf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> -#include <linux/sizes.h> #include "ctree.h" #include "transaction.h" @@ -22,6 +21,7 @@ #include "extent_io.h" #include "qgroup.h" #include "block-group.h" +#include "sysfs.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -220,10 +220,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -252,7 +254,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); return 0; } @@ -351,6 +353,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* default this to quota off, in case no status key is found */ fs_info->qgroup_flags = 0; @@ -412,6 +417,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr; @@ -500,12 +509,51 @@ out: ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + btrfs_sysfs_del_qgroups(fs_info); } return ret < 0 ? ret : 0; } /* + * Called in close_ctree() when quota is still enabled. This verifies we don't + * leak some reserved space. + * + * Return false if no reserved space is left. + * Return true if some reserved space is leaked. + */ +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + bool ret = false; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return ret; + /* + * Since we're unmounting, there is no race and no need to grab qgroup + * lock. And here we don't go post-order to provide a more user + * friendly sorted result. 
+ */ + for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { + struct btrfs_qgroup *qgroup; + int i; + + qgroup = rb_entry(node, struct btrfs_qgroup, node); + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { + if (qgroup->rsv.values[i]) { + ret = true; + btrfs_warn(fs_info, + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + i, qgroup->rsv.values[i]); + } + } + } + return ret; +} + +/* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set * quota_root to be null with qgroup_lock held before, so it is safe to clean @@ -519,7 +567,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting @@ -528,6 +576,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) */ ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; + btrfs_sysfs_del_qgroups(fs_info); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, @@ -900,6 +949,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -987,6 +1039,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { @@ -1011,6 +1068,11 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } ret = btrfs_commit_transaction(trans); trans = NULL; @@ -1046,6 +1108,7 @@ out: fs_info->qgroup_ulist = NULL; if (trans) btrfs_end_transaction(trans); + btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1398,8 +1461,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) qgroup = add_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); - if (IS_ERR(qgroup)) + if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); + goto out; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -2818,6 +2884,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); + if (!ret) + ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); @@ -2826,20 +2894,8 @@ out: return ret; } -/* - * Two limits to commit transaction in advance. - * - * For RATIO, it will be 1/RATIO of the remaining limit as threshold. - * For SIZE, it will be in byte unit as threshold. 
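With the early-commit thresholds above removed, the hunk that follows reduces qgroup_check_limits() to a plain comparison: a reservation is rejected only if the outstanding reservations plus the current usage plus the request would exceed the configured maximum. A userspace model of that check, with an invented struct mirroring the qgroup fields:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative model of the simplified limit check; field names mirror the
 * qgroup ones but the struct itself is made up. */
struct qgroup_model {
	uint64_t rfer;		/* referenced bytes already accounted */
	uint64_t rsv_total;	/* outstanding reservations */
	uint64_t max_rfer;	/* 0 means no limit in this model */
};

static bool check_limits_model(const struct qgroup_model *qg, uint64_t num_bytes)
{
	if (qg->max_rfer &&
	    qg->rsv_total + qg->rfer + num_bytes > qg->max_rfer)
		return false;
	return true;
}

int main(void)
{
	struct qgroup_model qg = { .rfer = 900, .rsv_total = 50, .max_rfer = 1000 };

	printf("%s\n", check_limits_model(&qg, 40) ? "ok" : "-EDQUOT");	/* ok */
	printf("%s\n", check_limits_model(&qg, 60) ? "ok" : "-EDQUOT");	/* -EDQUOT */
	return 0;
}

Dropping the "commit early when close to the limit" heuristic pairs with the flush-on-EDQUOT retry added later in the patch.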
- */ -#define QGROUP_FREE_RATIO 32 -#define QGROUP_FREE_SIZE SZ_32M -static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup *qg, u64 num_bytes) +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 free; - u64 threshold; - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2848,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; - /* - * Even if we passed the check, it's better to check if reservation - * for meta_pertrans is pushing us near limit. - * If there is too much pertrans reservation or it's near the limit, - * let's try commit transaction to free some, using transaction_kthread - */ - if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | - BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { - free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; - threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } else { - free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; - threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } - - /* - * Use transaction_kthread to commit transaction, so we no - * longer need to bother nested transaction nor lock context. - */ - if (free < threshold) - btrfs_commit_transaction_locksafe(fs_info); - } - return true; } @@ -2921,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qg, num_bytes)) { ret = -EDQUOT; goto out; } @@ -3378,28 +3408,132 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) } } +#define rbtree_iterate_from_safe(node, next, start) \ + for (node = start; node && ({ next = rb_next(node); 1;}); node = next) + +static int qgroup_unreserve_range(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len) +{ + struct rb_node *node; + struct rb_node *next; + struct ulist_node *entry = NULL; + int ret = 0; + + node = reserved->range_changed.root.rb_node; + while (node) { + entry = rb_entry(node, struct ulist_node, rb_node); + if (entry->val < start) + node = node->rb_right; + else if (entry) + node = node->rb_left; + else + break; + } + + /* Empty changeset */ + if (!entry) + return 0; + + if (entry->val > start && rb_prev(&entry->rb_node)) + entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, + rb_node); + + rbtree_iterate_from_safe(node, next, &entry->rb_node) { + u64 entry_start; + u64 entry_end; + u64 entry_len; + int clear_ret; + + entry = rb_entry(node, struct ulist_node, rb_node); + entry_start = entry->val; + entry_end = entry->aux; + entry_len = entry_end - entry_start + 1; + + if (entry_start >= start + len) + break; + if (entry_start + entry_len <= start) + continue; + /* + * Now the entry is in [start, start + len), revert the + * EXTENT_QGROUP_RESERVED bit. 
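qgroup_unreserve_range() above first locates the earliest recorded reservation that can still overlap [start, start + len) and then walks forward, dropping only the overlapping entries from the changeset. The userspace model below uses a sorted array where the kernel uses the ulist rbtree; the struct and byte counts are invented for illustration.

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* end is inclusive, like entry->aux */

static uint64_t unreserve_range_model(struct range *r, int nr,
				      uint64_t *bytes_changed,
				      uint64_t start, uint64_t len)
{
	uint64_t dropped = 0;
	int i = 0;

	/* find the first entry that might still overlap the range */
	while (i < nr && r[i].end < start)
		i++;

	for (; i < nr; i++) {
		uint64_t rlen = r[i].end - r[i].start + 1;

		if (r[i].start >= start + len)
			break;			/* past the range, done */
		dropped += rlen;
		*bytes_changed -= (rlen <= *bytes_changed) ? rlen : *bytes_changed;
	}
	return dropped;
}

int main(void)
{
	struct range r[] = { { 0, 4095 }, { 8192, 12287 }, { 65536, 69631 } };
	uint64_t bytes_changed = 4096 * 3;
	uint64_t d = unreserve_range_model(r, 3, &bytes_changed, 8192, 4096);

	printf("dropped %llu, %llu still reserved\n",
	       (unsigned long long)d, (unsigned long long)bytes_changed);
	return 0;
}

The point of the helper is visible in the model: only the failed range is rolled back, the other reservations in the same changeset survive.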
+ */ + clear_ret = clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); + if (!ret && clear_ret < 0) + ret = clear_ret; + + ulist_del(&reserved->range_changed, entry->val, entry->aux); + if (likely(reserved->bytes_changed >= entry_len)) { + reserved->bytes_changed -= entry_len; + } else { + WARN_ON(1); + reserved->bytes_changed = 0; + } + } + + return ret; +} + /* - * Reserve qgroup space for range [start, start + len). + * Try to free some space for qgroup. * - * This function will either reserve space from related qgroups or doing - * nothing if the range is already reserved. + * For qgroup, there are only 3 ways to free qgroup space: + * - Flush nodatacow write + * Any nodatacow write will free its reserved data space at run_delalloc_range(). + * In theory, we should only flush nodatacow inodes, but it's not yet + * possible, so we need to flush the whole root. * - * Return 0 for successful reserve - * Return <0 for error (including -EQUOT) + * - Wait for ordered extents + * When ordered extents are finished, their reserved metadata is finally + * converted to per_trans status, which can be freed by later commit + * transaction. * - * NOTE: this function may sleep for memory allocation. - * if btrfs_qgroup_reserve_data() is called multiple times with - * same @reserved, caller must ensure when error happens it's OK - * to free *ALL* reserved space. + * - Commit transaction + * This would free the meta_per_trans space. + * In theory this shouldn't provide much space, but any more qgroup space + * is needed. */ -int btrfs_qgroup_reserve_data(struct inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. 
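The comment above describes the single-flusher rule that the code following it implements with test_and_set_bit() on BTRFS_ROOT_QGROUP_FLUSHING and a wait queue: one caller pays for the flush, everyone else just waits for it. A pthread-based userspace model of the same idea is sketched below; it is only an analogy, not the kernel primitives.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static bool flushing;

static void expensive_flush(void)
{
	/* stand-in for start_delalloc + wait for ordered extents + commit */
}

static void try_flush_model(void)
{
	pthread_mutex_lock(&lock);
	if (flushing) {
		/* someone else is flushing: just wait for them to finish */
		while (flushing)
			pthread_cond_wait(&done, &lock);
		pthread_mutex_unlock(&lock);
		return;
	}
	flushing = true;
	pthread_mutex_unlock(&lock);

	expensive_flush();

	pthread_mutex_lock(&lock);
	flushing = false;
	pthread_cond_broadcast(&done);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	try_flush_model();
	puts("flushed once");
	return 0;
}

Either way the caller returns and then retries its reservation, which is what the wrappers added further down rely on.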
+ */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + + ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +} + +static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct ulist_node *unode; - struct ulist_iterator uiter; + struct btrfs_root *root = inode->root; struct extent_changeset *reserved; + bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; @@ -3412,6 +3546,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { + new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; @@ -3419,15 +3554,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; - trace_btrfs_qgroup_reserve_data(inode, start, len, + trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) - goto cleanup; + goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -3435,23 +3570,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode, return ret; cleanup: - /* cleanup *ALL* already reserved ranges */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); - /* Also free data bytes of already reserved one */ - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, - orig_reserved, BTRFS_QGROUP_RSV_DATA); - extent_changeset_release(reserved); + qgroup_unreserve_range(inode, reserved, start, len); +out: + if (new_reserved) { + extent_changeset_release(reserved); + kfree(reserved); + *reserved_ret = NULL; + } return ret; } +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do nothing + * if the range is already reserved. + * + * Return 0 for successful reservation + * Return <0 for error (including -EQUOT) + * + * NOTE: This function may sleep for memory allocation, dirty page flushing and + * commit transaction. So caller should not hold any dirty page locked. 
+ */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret; + + ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +} + /* Free ranges specified by @reserved, normally in error path */ -static int qgroup_free_reserved_data(struct inode *inode, +static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; @@ -3487,8 +3648,8 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, - free_start, free_start + free_len - 1, + ret = clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -3502,7 +3663,7 @@ out: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, +static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, int free) { @@ -3510,8 +3671,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, - &BTRFS_I(inode)->root->fs_info->flags)) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) return 0; /* In release case, we shouldn't have @reserved */ @@ -3519,18 +3679,18 @@ static int __btrfs_qgroup_release_data(struct inode *inode, if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; if (free) trace_op = QGROUP_FREE; - trace_btrfs_qgroup_release_data(inode, start, len, + trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, changeset.bytes_changed, trace_op); if (free) - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3550,7 +3710,7 @@ out: * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, +int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); @@ -3571,7 +3731,7 @@ int btrfs_qgroup_free_data(struct inode *inode, * * NOTE: This function may sleep for memory allocation. 
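The btrfs_qgroup_reserve_data() wrapper above captures the new EDQUOT policy: try the reservation once, and only if it fails with -EDQUOT pay for a flush and retry a single time. The same shape is applied to the metadata reservation wrapper later in the patch. A userspace model of the pattern, with reserve_once() and flush() as stand-ins for the real helpers:

#include <stdio.h>
#include <errno.h>

static int budget = 4096;

static int reserve_once(int bytes)
{
	if (bytes > budget)
		return -EDQUOT;
	budget -= bytes;
	return 0;
}

static void flush(void)
{
	budget += 8192;		/* pretend the flush released some space */
}

static int reserve_with_retry(int bytes)
{
	int ret = reserve_once(bytes);

	if (ret != -EDQUOT)	/* success, or a real error: return as-is */
		return ret;
	flush();		/* expensive: delalloc + ordered wait + commit */
	return reserve_once(bytes);
}

int main(void)
{
	printf("%d\n", reserve_with_retry(6000));	/* 0 after the flush */
	return 0;
}

Retrying exactly once keeps the worst case bounded: a caller never loops flushing, it either fits after one flush or gets -EDQUOT back.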
*/ -int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) { return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } @@ -3616,7 +3776,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; } -int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3643,6 +3803,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; } +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret; + + ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +} + void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3742,7 +3917,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) * Check qgroup reserved space leaking, normally at destroy inode * time */ -void btrfs_qgroup_check_reserved_leak(struct inode *inode) +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) { struct extent_changeset changeset; struct ulist_node *unode; @@ -3750,19 +3925,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", - inode->i_ino, unode->val, unode->aux); + btrfs_warn(inode->root->fs_info, + "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", + btrfs_ino(inode), unode->val, unode->aux); } - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 1bc654459469..50dea9a2d8fb 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/rbtree.h> +#include <linux/kobject.h> #include "ulist.h" #include "delayed-ref.h" @@ -223,8 +224,18 @@ struct btrfs_qgroup { */ u64 old_refcnt; u64 new_refcnt; + + /* + * Sysfs kobjectid + */ + struct kobject kobj; }; +static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + /* * For qgroup event trace points only */ @@ -344,12 +355,12 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode 
*inode, - struct extent_changeset *reserved, u64 start, u64 len); - +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); /* Reserve metadata space for pertrans and prealloc type */ @@ -399,7 +410,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); -void btrfs_qgroup_check_reserved_leak(struct inode *inode); +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ void btrfs_qgroup_init_swapped_blocks( @@ -415,5 +426,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c870ef70f817..255490f42b5d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1083,7 +1083,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, unsigned long bio_max_len) { struct bio *last = bio_list->tail; - u64 last_end = 0; int ret; struct bio *bio; struct btrfs_bio_stripe *stripe; @@ -1098,15 +1097,14 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = (u64)last->bi_iter.bi_sector << 9; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_status && + if (last_end == disk_start && !last->bi_status && last->bi_disk == stripe->dev->bdev->bd_disk && last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); @@ -1117,6 +1115,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_io_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; @@ -1325,11 +1324,7 @@ write_data: atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -1354,7 +1349,6 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { u64 physical = bio->bi_iter.bi_sector; - u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1362,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; - stripe_start = stripe->physical; - if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len && + if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { @@ -1382,18 +1374,14 @@ static 
int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_iter.bi_sector; - u64 stripe_start; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; int i; - logical <<= 9; - for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->bbio->raid_map[i]; - if (logical >= stripe_start && - logical < stripe_start + rbio->stripe_len) { + u64 stripe_start = rbio->bbio->raid_map[i]; + + if (in_range(logical, stripe_start, rbio->stripe_len)) return i; - } } return -1; } @@ -1567,11 +1555,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; @@ -1878,11 +1862,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* make sure our ps and qs are in order */ - if (faila > failb) { - int tmp = failb; - failb = faila; - faila = tmp; - } + if (faila > failb) + swap(faila, failb); /* if the q stripe is failed, do a pstripe reconstruction * from the xors. @@ -2102,7 +2083,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) */ if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { __raid_recover_end_io(rbio); - goto out; + return 0; } else { goto cleanup; } @@ -2113,11 +2094,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; @@ -2126,7 +2103,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) submit_bio(bio); } -out: + return 0; cleanup: @@ -2482,11 +2459,7 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -2664,11 +2637,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index af92525dbb16..7f03dbe5b609 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, exist_re = insert_root_entry(&exist->roots, re); if (exist_re) kfree(re); + } else { + kfree(re); } kfree(be); return exist; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 040009d1cc31..5cd02514cf4d 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -68,8 +68,8 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. 
*/ - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, - block_size); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + file_offset, block_size); if (ret) goto out; @@ -84,7 +84,8 @@ static int copy_inline_to_page(struct inode *inode, clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, + 0, NULL); if (ret) goto out_unlock; @@ -133,8 +134,8 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(inode, data_reserved, file_offset, - block_size, true); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + file_offset, block_size, true); btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); out: extent_changeset_free(data_reserved); @@ -336,6 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, while (1) { u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; + u64 extent_gen; int type; u32 size; struct btrfs_key new_key; @@ -384,6 +386,7 @@ process_slot: extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + extent_gen = btrfs_file_extent_generation(leaf, extent); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG || @@ -488,6 +491,19 @@ process_slot: btrfs_release_path(path); + /* + * If this is a new extent update the last_reflink_trans of both + * inodes. This is used by fsync to make sure it does not log + * multiple checksum items with overlapping ranges. For older + * extents we don't need to do it since inode logging skips the + * checksums for older extents. Also ignore holes and inline + * extents because they don't have checksums in the csum tree. + */ + if (extent_gen == trans->transid && disko > 0) { + BTRFS_I(src)->last_reflink_trans = trans->transid; + BTRFS_I(inode)->last_reflink_trans = trans->transid; + } + last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3bbae80c752f..4ba1ab9cc76d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + /* + * In merge_reloc_root(), we modify the upper level pointer to swap the + * tree blocks between reloc tree and subvolume tree. Thus for tree + * block COW, we COW at most from level 1 to root level for each tree. + * + * Thus the needed metadata size is at most root_level * nodesize, + * and * 2 since we have two trees to COW. 
+ */ + min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { err = ret; goto out; @@ -2571,58 +2579,50 @@ out_free_blocks: return err; } -static noinline_for_stack -int prealloc_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) +static noinline_for_stack int prealloc_file_extent_cluster( + struct btrfs_inode *inode, + struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; u64 end; - u64 offset = BTRFS_I(inode)->index_cnt; + u64 offset = inode->index_cnt; u64 num_bytes; - int nr = 0; + int nr; int ret = 0; u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; - u64 cur_offset; - struct extent_changeset *data_reserved = NULL; + u64 cur_offset = prealloc_start; BUG_ON(cluster->start != cluster->boundary[0]); - inode_lock(inode); - - ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, - prealloc_end + 1 - prealloc_start); + ret = btrfs_alloc_data_chunk_ondemand(inode, + prealloc_end + 1 - prealloc_start); if (ret) - goto out; + return ret; - cur_offset = prealloc_start; - while (nr < cluster->nr) { + inode_lock(&inode->vfs_inode); + for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&inode->io_tree, start, end); num_bytes = end + 1 - start; - if (cur_offset < start) - btrfs_free_reserved_data_space(inode, data_reserved, - cur_offset, start - cur_offset); - ret = btrfs_prealloc_file_range(inode, 0, start, + ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end); if (ret) break; - nr++; } + inode_unlock(&inode->vfs_inode); + if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, data_reserved, - cur_offset, prealloc_end + 1 - cur_offset); -out: - inode_unlock(inode); - extent_changeset_free(data_reserved); + btrfs_free_reserved_data_space_noquota(inode->root->fs_info, + prealloc_end + 1 - cur_offset); return ret; } @@ -2664,7 +2664,8 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, */ int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { - return atomic_read(&fs_info->balance_cancel_req); + return atomic_read(&fs_info->balance_cancel_req) || + fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); @@ -2690,7 +2691,7 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - ret = prealloc_file_extent_cluster(inode, cluster); + ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); if (ret) goto out; @@ -2762,8 +2763,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, + page_end, 0, NULL); if (ret) { unlock_page(page); put_page(page); @@ -3872,9 +3873,9 @@ out: * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. 
*/ -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; int ret; @@ -3885,7 +3886,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) ordered = btrfs_lookup_ordered_extent(inode, file_pos); BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + disk_bytenr = file_pos + inode->index_cnt; ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 016a025e36c7..5a6cb9db512e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1616,13 +1616,9 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, struct scrub_page *spage = sblock->pagev[page_num]; BUG_ON(spage->page == NULL); - if (spage->io_error) { - void *mapped_buffer = kmap_atomic(spage->page); + if (spage->io_error) + clear_page(page_address(spage->page)); - clear_page(mapped_buffer); - flush_dcache_page(spage->page); - kunmap_atomic(mapped_buffer); - } return scrub_add_page_to_wr_bio(sblock->sctx, spage); } @@ -1790,42 +1786,21 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; - u8 *on_disk_csum; - struct page *page; - void *buffer; - u64 len; - int index; + struct scrub_page *spage; + char *kaddr; BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0]->have_csum) + spage = sblock->pagev[0]; + if (!spage->have_csum) return 0; + kaddr = page_address(spage->page); + shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); + crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); - on_disk_csum = sblock->pagev[0]->csum; - page = sblock->pagev[0]->page; - buffer = kmap_atomic(page); - - len = sctx->fs_info->sectorsize; - index = 0; - for (;;) { - u64 l = min_t(u64, len, PAGE_SIZE); - - crypto_shash_update(shash, buffer, l); - kunmap_atomic(buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index]->page); - page = sblock->pagev[index]->page; - buffer = kmap_atomic(page); - } - - crypto_shash_final(shash, csum); - if (memcmp(csum, on_disk_csum, sctx->csum_size)) + if (memcmp(csum, spage->csum, sctx->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; @@ -1839,20 +1814,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; - u64 len; - int index; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); + const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT; + int i; + struct scrub_page *spage; + char *kaddr; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0]->page; - mapped_buffer = kmap_atomic(page); - h = (struct btrfs_header *)mapped_buffer; + spage = sblock->pagev[0]; + kaddr = page_address(spage->page); + h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->csum_size); /* @@ -1860,40 +1830,29 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an 
extent buffer and * b) the page is already kmapped */ - if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) + if (spage->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; - if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) { + if (spage->generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } - if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) + if (!scrub_check_fsid(h->fsid, spage)) sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - crypto_shash_update(shash, p, l); - kunmap_atomic(mapped_buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index]->page); - page = sblock->pagev[index]->page; - mapped_buffer = kmap_atomic(page); - mapped_size = PAGE_SIZE; - p = mapped_buffer; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(sblock->pagev[i]->page); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } crypto_shash_final(shash, calculated_csum); @@ -1910,57 +1869,31 @@ static int scrub_checksum_super(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; - u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; + struct scrub_page *spage; + char *kaddr; int fail_gen = 0; int fail_cor = 0; - u64 len; - int index; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0]->page; - mapped_buffer = kmap_atomic(page); - s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sctx->csum_size); + spage = sblock->pagev[0]; + kaddr = page_address(spage->page); + s = (struct btrfs_super_block *)kaddr; - if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) + if (spage->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (sblock->pagev[0]->generation != btrfs_super_generation(s)) + if (spage->generation != btrfs_super_generation(s)) ++fail_gen; - if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) + if (!scrub_check_fsid(s->fsid, spage)) ++fail_cor; - len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); - - crypto_shash_update(shash, p, l); - kunmap_atomic(mapped_buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index]->page); - page = sblock->pagev[index]->page; - mapped_buffer = kmap_atomic(page); - mapped_size = PAGE_SIZE; - p = mapped_buffer; - } + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); - crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + if (memcmp(calculated_csum, s->csum, sctx->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -1973,10 
+1906,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + btrfs_dev_stat_inc_and_print(spage->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + btrfs_dev_stat_inc_and_print(spage->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -3758,7 +3691,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - return -EIO; + return -EROFS; /* Seed devices of a new filesystem has their own generation. */ if (scrub_dev->fs_devices != fs_info->fs_devices) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c7bd3fdd7792..475968ccbd1d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -468,8 +468,8 @@ again: "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); + btrfs_dump_free_space(cache, bytes); } if (++index < BTRFS_NR_RAID_TYPES) goto again; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c3826ae883f0..5a9dc31d95c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -67,6 +67,21 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. 
+ */ const char * __attribute_const__ btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -326,7 +341,6 @@ enum { Opt_defrag, Opt_nodefrag, Opt_discard, Opt_nodiscard, Opt_discard_mode, - Opt_nologreplay, Opt_norecovery, Opt_ratio, Opt_rescan_uuid_tree, @@ -340,13 +354,15 @@ enum { Opt_subvolid, Opt_thread_pool, Opt_treelog, Opt_notreelog, - Opt_usebackuproot, Opt_user_subvol_rm_allowed, + /* Rescue options */ + Opt_rescue, + Opt_usebackuproot, + Opt_nologreplay, + /* Deprecated options */ - Opt_alloc_start, Opt_recovery, - Opt_subvolrootid, /* Debugging options */ Opt_check_integrity, @@ -390,7 +406,6 @@ static const match_table_t tokens = { {Opt_discard, "discard"}, {Opt_discard_mode, "discard=%s"}, {Opt_nodiscard, "nodiscard"}, - {Opt_nologreplay, "nologreplay"}, {Opt_norecovery, "norecovery"}, {Opt_ratio, "metadata_ratio=%u"}, {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, @@ -408,13 +423,17 @@ static const match_table_t tokens = { {Opt_thread_pool, "thread_pool=%u"}, {Opt_treelog, "treelog"}, {Opt_notreelog, "notreelog"}, - {Opt_usebackuproot, "usebackuproot"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + /* Rescue options */ + {Opt_rescue, "rescue=%s"}, + /* Deprecated, with alias rescue=nologreplay */ + {Opt_nologreplay, "nologreplay"}, + /* Deprecated, with alias rescue=usebackuproot */ + {Opt_usebackuproot, "usebackuproot"}, + /* Deprecated options */ - {Opt_alloc_start, "alloc_start=%s"}, {Opt_recovery, "recovery"}, - {Opt_subvolrootid, "subvolrootid=%d"}, /* Debugging options */ {Opt_check_integrity, "check_int"}, @@ -433,6 +452,55 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +static const match_table_t rescue_tokens = { + {Opt_usebackuproot, "usebackuproot"}, + {Opt_nologreplay, "nologreplay"}, + {Opt_err, NULL}, +}; + +static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) +{ + char *opts; + char *orig; + char *p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0; + + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ":")) != NULL) { + int token; + + if (!*p) + continue; + token = match_token(p, rescue_tokens, args); + switch (token){ + case Opt_usebackuproot: + btrfs_info(info, + "trying to use backup root at mount time"); + btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + break; + case Opt_nologreplay: + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; + case Opt_err: + btrfs_info(info, "unrecognized rescue option '%s'", p); + ret = -EINVAL; + goto out; + default: + break; + } + + } +out: + kfree(orig); + return ret; +} + /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. 
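The rescue= value handled by parse_rescue_options() above is cut at each ':' by strsep(), so several rescue options can in principle be combined in one mount option (for example -o rescue=nologreplay:usebackuproot). A minimal userspace sketch of that splitting follows; it assumes only the two tokens accepted by rescue_tokens, and printf() stands in for the btrfs_set_opt()/btrfs_set_and_info() calls in the real parser.

#define _DEFAULT_SOURCE		/* for strsep()/strdup() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_rescue(const char *value)
{
	char *opts = strdup(value);	/* strsep() modifies the string, so work on a copy */
	char *orig = opts;
	char *p;
	int ret = 0;

	if (!opts)
		return -1;

	while ((p = strsep(&opts, ":")) != NULL) {
		if (!*p)
			continue;	/* tolerate empty tokens such as "a::b" */
		if (strcmp(p, "usebackuproot") == 0)
			printf("set USEBACKUPROOT\n");
		else if (strcmp(p, "nologreplay") == 0)
			printf("set NOLOGREPLAY\n");
		else {
			fprintf(stderr, "unrecognized rescue option '%s'\n", p);
			ret = -1;
			break;
		}
	}
	free(orig);
	return ret;
}

int main(void)
{
	return parse_rescue("nologreplay:usebackuproot") ? 1 : 0;
}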
@@ -479,7 +547,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvol: case Opt_subvol_empty: case Opt_subvolid: - case Opt_subvolrootid: case Opt_device: /* * These are parsed by btrfs_parse_subvol_options or @@ -663,10 +730,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; } break; - case Opt_alloc_start: - btrfs_info(info, - "option alloc_start is obsolete, ignored"); - break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL info->sb->s_flags |= SB_POSIXACL; @@ -689,6 +752,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_norecovery: case Opt_nologreplay: + btrfs_warn(info, + "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; @@ -762,6 +827,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_inode_cache: + btrfs_warn(info, + "the 'inode_cache' option is deprecated and will have no effect from 5.11"); btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; @@ -791,10 +858,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "disabling auto defrag"); break; case Opt_recovery: - btrfs_warn(info, - "'recovery' is deprecated, use 'usebackuproot' instead"); - fallthrough; case Opt_usebackuproot: + btrfs_warn(info, + "'%s' is deprecated, use 'rescue=usebackuproot' instead", + token == Opt_recovery ? "recovery" : + "usebackuproot"); btrfs_info(info, "trying to use backup root at mount time"); btrfs_set_opt(info->mount_opt, USEBACKUPROOT); @@ -859,6 +927,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } info->commit_interval = intarg; break; + case Opt_rescue: + ret = parse_rescue_options(info, args[0].from); + if (ret < 0) + goto out; + break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: btrfs_info(info, "fragmenting all space"); @@ -1020,9 +1093,6 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name, *subvol_objectid = subvolid; break; - case Opt_subvolrootid: - pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); - break; default: break; } @@ -1344,7 +1414,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) - seq_puts(seq, ",nologreplay"); + seq_puts(seq, ",rescue=nologreplay"); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) @@ -1712,11 +1782,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) -{ - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); -} - static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, unsigned long old_opts, int flags) { @@ -1750,8 +1815,6 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_cleanup(fs_info); - - clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) @@ -1767,7 +1830,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) int ret; sync_filesystem(sb); - btrfs_remount_prepare(fs_info); + set_bit(BTRFS_FS_STATE_REMOUNTING, 
&fs_info->fs_state); if (data) { void *new_sec_opts = NULL; @@ -1889,6 +1952,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) out: wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + return 0; restore: @@ -1903,6 +1968,8 @@ restore: old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; btrfs_remount_cleanup(fs_info, old_opts); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + return ret; } @@ -2296,9 +2363,7 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_fs_devices *cur_devices; struct btrfs_device *dev, *first_dev = NULL; - struct list_head *head; /* * Lightweight locking of the devices. We should not need @@ -2308,18 +2373,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * least until the rcu_read_unlock. */ rcu_read_lock(); - cur_devices = fs_info->fs_devices; - while (cur_devices) { - head = &cur_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - cur_devices = cur_devices->seed; + list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!dev->name) + continue; + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; } if (first_dev) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a39bff64ff24..104c80caaa74 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -19,6 +19,7 @@ #include "volumes.h" #include "space-info.h" #include "block-group.h" +#include "qgroup.h" struct btrfs_feature_attr { struct kobj_attribute kobj_attr; @@ -936,8 +937,12 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + btrfs_reset_fs_info_ptr(fs_info); + sysfs_remove_link(fsid_kobj, "bdi"); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); @@ -957,8 +962,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) } #endif addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } @@ -1273,7 +1278,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, { int error = 0; struct btrfs_device *dev; + unsigned int nofs_flag; + nofs_flag = memalloc_nofs_save(); list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (one_device && one_device != dev) @@ -1301,6 +1308,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, break; } } + memalloc_nofs_restore(nofs_flag); return error; } @@ -1438,6 +1446,10 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; + error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (error) + goto failure; + 
fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { @@ -1455,6 +1467,153 @@ failure: return error; } +static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) +{ + return to_fs_info(kobj->parent->parent); +} + +#define QGROUP_ATTR(_member, _show_name) \ +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) + +#define QGROUP_RSV_ATTR(_name, _type) \ +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->rsv.values[_type], \ + &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) + +QGROUP_ATTR(rfer, referenced); +QGROUP_ATTR(excl, exclusive); +QGROUP_ATTR(max_rfer, max_referenced); +QGROUP_ATTR(max_excl, max_exclusive); +QGROUP_ATTR(lim_flags, limit_flags); +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); + +static struct attribute *qgroup_attrs[] = { + BTRFS_ATTR_PTR(qgroup, referenced), + BTRFS_ATTR_PTR(qgroup, exclusive), + BTRFS_ATTR_PTR(qgroup, max_referenced), + BTRFS_ATTR_PTR(qgroup, max_exclusive), + BTRFS_ATTR_PTR(qgroup, limit_flags), + BTRFS_ATTR_PTR(qgroup, rsv_data), + BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), + BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), + NULL +}; +ATTRIBUTE_GROUPS(qgroup); + +static void qgroup_release(struct kobject *kobj) +{ + struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); + + memset(&qgroup->kobj, 0, sizeof(*kobj)); +} + +static struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +}; + +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + struct kobject *qgroups_kobj = fs_info->qgroups_kobj; + int ret; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + if (qgroup->kobj.state_initialized) + return 0; + if (!qgroups_kobj) + return -EINVAL; + + ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, + "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid)); + if (ret < 0) + kobject_put(&qgroup->kobj); + + return ret; +} + +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kobject_del(fs_info->qgroups_kobj); + kobject_put(fs_info->qgroups_kobj); + fs_info->qgroups_kobj = NULL; +} + +/* Called when qgroups get initialized, thus there is no need for locking */ +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj = 
&fs_info->fs_devices->fsid_kobj; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + int ret = 0; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + + ASSERT(fsid_kobj); + if (fs_info->qgroups_kobj) + return 0; + + fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); + if (!fs_info->qgroups_kobj) { + ret = -ENOMEM; + goto out; + } + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) { + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + } + +out: + if (ret < 0) + btrfs_sysfs_del_qgroups(fs_info); + return ret; +} + +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + if (qgroup->kobj.state_initialized) { + kobject_del(&qgroup->kobj); + kobject_put(&qgroup->kobj); + } +} /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 718a26c97833..cf839c46a131 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -36,4 +36,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); void btrfs_sysfs_update_devid(struct btrfs_device *device); +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); + #endif diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 914eea5ba6a7..2c783d2f5228 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { - if (i >= num_extents) - goto invalid; if (i >= num_extents || extent_start != extents[i].start || offset - extent_start != extents[i].length) diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 24a8c714f56c..894a63a92236 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -954,8 +954,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, - NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, + BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 0, NULL); if (ret) { @@ -999,7 +999,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, 0, NULL); @@ -1017,7 +1017,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) 
/* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, 0, NULL); @@ -1035,7 +1035,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { @@ -1069,7 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b359d4b17658..20c6ac1a5de7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); - err = -EIO; + if (TRANS_ABORTED(trans)) + err = trans->aborted; + else + err = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -1630,7 +1633,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_fs_root(fs_info, objectid, true); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); @@ -2351,7 +2354,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index bf102e64bfb2..d60b055b8695 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -151,18 +151,20 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; + /* Preallocated anonymous block device number */ + dev_t anon_dev; bool readonly; struct list_head list; }; static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_trans = trans->transaction->transid; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_trans = trans->transaction->transid; + inode->last_sub_trans = inode->root->log_transid; + inode->last_log_commit = inode->root->last_log_commit; + spin_unlock(&inode->lock); } /* @@ -208,20 +210,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); - -/* - * Try to commit transaction 
asynchronously, so this is safe to call - * even holding a spinlock. - * - * It's done by informing transaction_kthread to commit transaction without - * waiting for commit interval. - */ -static inline void btrfs_commit_transaction_locksafe( - struct btrfs_fs_info *fs_info) -{ - set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); - wake_up_process(fs_info->transaction_kthread); -} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 16c3a6d2586d..d3f28b8f4ff9 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -133,10 +133,9 @@ out: ret = 0; } done: - if (ret != -EAGAIN) { + if (ret != -EAGAIN) memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - root->defrag_trans_start = trans->transid; - } + return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cd5348f352dd..ea8136dcf71f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,29 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_batch); - atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; - mutex_unlock(&log_root_tree->log_mutex); - - mutex_lock(&log_root_tree->log_mutex); - /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit * open until we drop the log_mutex. */ ret = update_log_root(trans, log, &new_root_item); - - if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* atomic_dec_and_test implies a barrier */ - cond_wake_up_nomb(&log_root_tree->log_writer_wait); - } - if (ret) { if (!list_empty(&root_log_ctx.list)) list_del_init(&root_log_ctx.list); @@ -3184,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root_log_ctx.log_transid - 1); } - wait_for_writer(log_root_tree); - /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again @@ -3906,6 +3892,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, } static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { @@ -3914,6 +3901,14 @@ static int log_csums(struct btrfs_trans_handle *trans, int ret; /* + * If this inode was not used for reflink operations in the current + * transaction with new extents, then do the fast path, no need to + * worry about logging checksum items with overlapping ranges. + */ + if (inode->last_reflink_trans < trans->transid) + return btrfs_csum_file_blocks(trans, log_root, sums); + + /* * Serialize logging for checksums. This is to avoid racing with the * same checksum being logged by another task that is logging another * file which happens to refer to the same extent as well. 
Such races @@ -4064,7 +4059,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log, sums); + ret = log_csums(trans, inode, log, sums); list_del(&sums->list); kfree(sums); } @@ -4123,7 +4118,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log_root, sums); + ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4151,7 +4146,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); if (ret) @@ -5123,14 +5118,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, const loff_t end, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; int err = 0; - int ret; + int ret = 0; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5166,15 +5160,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.offset = (u64)-1; /* - * Only run delayed items if we are a dir or a new file. - * Otherwise commit the delayed inode only, which is needed in - * order for the log replay code to mark inodes for link count - * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + * Only run delayed items if we are a directory. We want to make sure + * all directory indexes hit the fs/subvolume tree so we can find them + * and figure out which index ranges have to be logged. + * + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode) || - inode->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode); - else + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f403fb1e6d37..d7670e2a9f39 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * * global::fs_devs - add, remove, updates to the global list * - * does not protect: manipulation of the fs_devices::devices list! + * does not protect: manipulation of the fs_devices::devices list in general + * but in mount context it could be used to exclude list modifications by eg. + * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * @@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * + * Is not required at mount and close times, because our device list is + * protected by the uuid_mutex at that point. 
+ * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(const char *path, return ret; } +/* + * This is only used on mount, and we are protected from competing things + * messing with our fs_devices by the uuid_mutex, thus we do not need the + * fs_devices->device_list_mutex here. + */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) @@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int ret; lockdep_assert_held(&uuid_mutex); + /* + * The device_list_mutex cannot be taken here in case opening the + * underlying device takes further locks like bd_mutex. + * + * We also don't need the lock here as this is called during mount and + * exclusion is provided by uuid_mutex + */ - mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; @@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, list_sort(NULL, &fs_devices->devices, devid_cmp); ret = open_fs_devices(fs_devices, flags, holder); } - mutex_unlock(&fs_devices->device_list_mutex); return ret; } @@ -3231,7 +3246,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); @@ -4135,7 +4150,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, mutex_lock(&fs_info->balance_mutex); if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) btrfs_info(fs_info, "balance: paused"); - else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + /* + * Balance can be canceled by: + * + * - Regular cancel request + * Then ret == -ECANCELED and balance_cancel_req > 0 + * + * - Fatal signal to "btrfs" process + * Either the signal caught by wait_reserve_ticket() and callers + * got -EINTR, or caught by btrfs_should_cancel_balance() and + * got -ECANCELED. + * Either way, in this case balance_cancel_req = 0, and + * ret == -EINTR or ret == -ECANCELED. + * + * So here we only check the return value to catch canceled balance. 
+ */ + else if (ret == -ECANCELED || ret == -EINTR) btrfs_info(fs_info, "balance: canceled"); else btrfs_info(fs_info, "balance: ended with status: %d", ret); @@ -5522,6 +5552,9 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) atomic_set(&bbio->error, 0); refcount_set(&bbio->refs, 1); + bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); + bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + return bbio; } @@ -6144,8 +6177,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + stripe_index++; + } /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && @@ -6153,11 +6191,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 tmp; unsigned rot; - bbio->raid_map = (u64 *)((void *)bbio->stripes + - sizeof(struct btrfs_bio_stripe) * - num_alloc_stripes + - sizeof(int) * tgtdev_indexes); - /* Work out the disk rotation on this stripe-set */ div_u64_rem(stripe_nr, num_stripes, &rot); @@ -6171,25 +6204,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID6) bbio->raid_map[(i+rot+1) % num_stripes] = RAID6_Q_STRIPE; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; + sort_parity_stripes(bbio, num_stripes); } if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); - if (bbio->raid_map) - sort_parity_stripes(bbio, num_stripes); - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, @@ -6261,23 +6282,18 @@ static void btrfs_end_bio(struct bio *bio) atomic_inc(&bbio->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - unsigned int stripe_index = - btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; - - BUG_ON(stripe_index >= bbio->num_stripes); - dev = bbio->stripes[stripe_index].dev; - if (dev->bdev) { - if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + struct btrfs_device *dev = btrfs_io_bio(bio)->device; + + ASSERT(dev->bdev); + if (bio_op(bio) == REQ_OP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + else if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - } } } @@ -6313,13 +6329,12 @@ static void btrfs_end_bio(struct bio *bio) } static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, - u64 physical, int dev_nr) + u64 physical, struct btrfs_device *dev) { - struct btrfs_device *dev = bbio->stripes[dev_nr].dev; struct btrfs_fs_info *fs_info = bbio->fs_info; bio->bi_private = bbio; - btrfs_io_bio(bio)->stripe_index = dev_nr; + btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector 
= physical >> 9; btrfs_debug_in_rcu(fs_info, @@ -6420,8 +6435,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, - dev_nr); + submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; @@ -7029,6 +7043,19 @@ out: return ret; } +static void readahead_tree_node_children(struct extent_buffer *node) +{ + int i; + const int nr_items = btrfs_header_nritems(node); + + for (i = 0; i < nr_items; i++) { + u64 start; + + start = btrfs_node_blockptr(node, i); + readahead_tree_block(node->fs_info, start); + } +} + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; @@ -7039,6 +7066,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) int ret; int slot; u64 total_dev = 0; + u64 last_ra_node = 0; path = btrfs_alloc_path(); if (!path) @@ -7049,7 +7077,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * otherwise we don't need it. */ mutex_lock(&uuid_mutex); - mutex_lock(&fs_info->chunk_mutex); /* * It is possible for mount and umount to race in such a way that @@ -7072,6 +7099,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret < 0) goto error; while (1) { + struct extent_buffer *node; + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { @@ -7082,6 +7111,17 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } + /* + * The nodes on level 1 are not locked but we don't need to do + * that during mount time as nothing else can access the tree + */ + node = path->nodes[1]; + if (node) { + if (last_ra_node != node->start) { + readahead_tree_node_children(node); + last_ra_node = node->start; + } + } btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; @@ -7094,7 +7134,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + mutex_lock(&fs_info->chunk_mutex); ret = read_one_chunk(&found_key, leaf, chunk); + mutex_unlock(&fs_info->chunk_mutex); if (ret) goto error; } @@ -7124,7 +7166,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } ret = 0; error: - mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&uuid_mutex); btrfs_free_path(path); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 75af2334b2e3..5eea93916fbf 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -288,7 +288,7 @@ struct btrfs_fs_devices { */ struct btrfs_io_bio { unsigned int mirror_num; - unsigned int stripe_index; + struct btrfs_device *device; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; diff --git a/fs/buffer.c b/fs/buffer.c index 64fe82ec65ff..061dd202979d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -320,9 +320,8 @@ static void decrypt_bh(struct work_struct *work) static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { /* Decrypt if needed */ - if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) && - IS_ENCRYPTED(bh->b_page->mapping->host) && - S_ISREG(bh->b_page->mapping->host->i_mode)) { + if (uptodate && + fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { @@ -3040,12 +3039,10 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head 
*bh, if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); - /* - * from here on down, it's all bio -- do the initial mapping, - * submit_bio -> generic_make_request may further map this bio around - */ bio = bio_alloc(GFP_NOIO, 1); + fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 8046d7c7a3e9..a5f5c30368a2 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -4,6 +4,7 @@ config FS_ENCRYPTION select CRYPTO select CRYPTO_HASH select CRYPTO_SKCIPHER + select CRYPTO_LIB_SHA256 select KEYS help Enable encryption of files and directories. This @@ -21,6 +22,11 @@ config FS_ENCRYPTION_ALGS select CRYPTO_CTS select CRYPTO_ECB select CRYPTO_HMAC - select CRYPTO_SHA256 select CRYPTO_SHA512 select CRYPTO_XTS + +config FS_ENCRYPTION_INLINE_CRYPT + bool "Enable fscrypt to use inline crypto" + depends on FS_ENCRYPTION && BLK_INLINE_ENCRYPTION + help + Enable fscrypt to use inline encryption hardware if available. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 232e2bb5a337..652c7180ec6d 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -11,3 +11,4 @@ fscrypto-y := crypto.o \ policy.o fscrypto-$(CONFIG_BLOCK) += bio.o +fscrypto-$(CONFIG_FS_ENCRYPTION_INLINE_CRYPT) += inline_crypt.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 4fa18fff9c4e..b048a0e38516 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -41,6 +41,53 @@ void fscrypt_decrypt_bio(struct bio *bio) } EXPORT_SYMBOL(fscrypt_decrypt_bio); +static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, + pgoff_t lblk, sector_t pblk, + unsigned int len) +{ + const unsigned int blockbits = inode->i_blkbits; + const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); + struct bio *bio; + int ret, err = 0; + int num_pages = 0; + + /* This always succeeds since __GFP_DIRECT_RECLAIM is set. 
*/ + bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + + while (len) { + unsigned int blocks_this_page = min(len, blocks_per_page); + unsigned int bytes_this_page = blocks_this_page << blockbits; + + if (num_pages == 0) { + fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_iter.bi_sector = + pblk << (blockbits - SECTOR_SHIFT); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + } + ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); + if (WARN_ON(ret != bytes_this_page)) { + err = -EIO; + goto out; + } + num_pages++; + len -= blocks_this_page; + lblk += blocks_this_page; + pblk += blocks_this_page; + if (num_pages == BIO_MAX_PAGES || !len || + !fscrypt_mergeable_bio(bio, inode, lblk)) { + err = submit_bio_wait(bio); + if (err) + goto out; + bio_reset(bio); + num_pages = 0; + } + } +out: + bio_put(bio); + return err; +} + /** * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode @@ -75,6 +122,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, if (len == 0) return 0; + if (fscrypt_inode_uses_inline_crypto(inode)) + return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, + len); + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), (len + blocks_per_page - 1) >> blocks_per_page_bits); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ed015cb66c7c..9212325763b0 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -84,7 +84,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, WARN_ON_ONCE(lblk_num > U32_MAX); lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); + memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE); } iv->lblk_num = cpu_to_le64(lblk_num); } @@ -100,7 +100,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; int res = 0; if (WARN_ON_ONCE(len <= 0)) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 83ca5f1e7934..011830f84d8d 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -61,30 +61,13 @@ struct fscrypt_nokey_name { */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) -static struct crypto_shash *sha256_hash_tfm; - -static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) +static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) { - struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm); - - if (unlikely(!tfm)) { - struct crypto_shash *prev_tfm; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - fscrypt_err(NULL, - "Error allocating SHA-256 transform: %ld", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm); - if (prev_tfm) { - crypto_free_shash(tfm); - tfm = prev_tfm; - } - } + struct sha256_state sctx; - return crypto_shash_tfm_digest(tfm, data, data_len, result); + sha256_init(&sctx); + sha256_update(&sctx, data, data_len); + sha256_final(&sctx, result); } static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) @@ -115,7 +98,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); const struct 
fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; int res; @@ -171,7 +154,7 @@ static int fname_decrypt(const struct inode *inode, DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; const struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; @@ -349,7 +332,6 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, const struct qstr qname = FSTR_TO_QSTR(iname); struct fscrypt_nokey_name nokey_name; u32 size; /* size of the unencoded no-key name */ - int err; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -387,11 +369,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. */ - err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], - iname->len - sizeof(nokey_name.bytes), - nokey_name.sha256); - if (err) - return err; + fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); @@ -530,9 +510,8 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; - if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], - de_name_len - sizeof(nokey_name->bytes), sha256)) - return false; + fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), sha256); return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index eb7fcd2b7fb8..8117a61b6f55 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -14,12 +14,13 @@ #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> +#include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FSCRYPT_FILE_NONCE_SIZE 16 -#define FSCRYPT_MIN_KEY_SIZE 16 +#define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 @@ -30,7 +31,7 @@ struct fscrypt_context_v1 { u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { @@ -40,7 +41,7 @@ struct fscrypt_context_v2 { u8 flags; u8 __reserved[4]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* @@ -166,6 +167,20 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; +/** + * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption + * @tfm: crypto API transform object + * @blk_key: key for blk-crypto + * + * Normally only one of the fields will be non-NULL. 
+ */ +struct fscrypt_prepared_key { + struct crypto_skcipher *tfm; +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + struct fscrypt_blk_crypto_key *blk_key; +#endif +}; + /* * fscrypt_info - the "encryption key" for an inode * @@ -175,12 +190,20 @@ struct fscrypt_symlink_data { */ struct fscrypt_info { - /* The actual crypto transform used for encryption and decryption */ - struct crypto_skcipher *ci_ctfm; + /* The key in a form prepared for actual encryption/decryption */ + struct fscrypt_prepared_key ci_enc_key; - /* True if the key should be freed when this fscrypt_info is freed */ + /* True if ci_enc_key should be freed when this fscrypt_info is freed */ bool ci_owns_key; +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + /* + * True if this inode will use inline encryption (blk-crypto) instead of + * the traditional filesystem-layer encryption. + */ + bool ci_inlinecrypt; +#endif + /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. @@ -205,7 +228,7 @@ struct fscrypt_info { /* * If non-NULL, then encryption is done using the master key directly - * and ci_ctfm will equal ci_direct_key->dk_ctfm. + * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; @@ -221,7 +244,7 @@ struct fscrypt_info { union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ - u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; @@ -257,9 +280,10 @@ union fscrypt_iv { __le64 lblk_num; /* per-file nonce; only set in DIRECT_KEY mode */ - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; + __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, @@ -288,13 +312,13 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. 
*/ -#define HKDF_CONTEXT_KEY_IDENTIFIER 1 -#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 -#define HKDF_CONTEXT_DIRECT_KEY 3 -#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 -#define HKDF_CONTEXT_DIRHASH_KEY 5 -#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 -#define HKDF_CONTEXT_INODE_HASH_KEY 7 +#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ +#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ +#define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ +#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ +#define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ +#define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, @@ -302,6 +326,78 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +/* inline_crypt.c */ +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT +int fscrypt_select_encryption_impl(struct fscrypt_info *ci); + +static inline bool +fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +{ + return ci->ci_inlinecrypt; +} + +int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, + const struct fscrypt_info *ci); + +void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key); + +/* + * Check whether the crypto transform or blk-crypto key has been allocated in + * @prep_key, depending on which encryption implementation the file will use. + */ +static inline bool +fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, + const struct fscrypt_info *ci) +{ + /* + * The two smp_load_acquire()'s here pair with the smp_store_release()'s + * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). + * I.e., in some cases (namely, if this prep_key is a per-mode + * encryption key) another task can publish blk_key or tfm concurrently, + * executing a RELEASE barrier. We need to use smp_load_acquire() here + * to safely ACQUIRE the memory the other task published. + */ + if (fscrypt_using_inline_encryption(ci)) + return smp_load_acquire(&prep_key->blk_key) != NULL; + return smp_load_acquire(&prep_key->tfm) != NULL; +} + +#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ + +static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +{ + return 0; +} + +static inline bool +fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +{ + return false; +} + +static inline int +fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, + const struct fscrypt_info *ci) +{ + WARN_ON(1); + return -EOPNOTSUPP; +} + +static inline void +fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +{ +} + +static inline bool +fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, + const struct fscrypt_info *ci) +{ + return smp_load_acquire(&prep_key->tfm) != NULL; +} +#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ + /* keyring.c */ /* @@ -395,9 +491,9 @@ struct fscrypt_master_key { * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. 
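[Editorial aside on the HKDF_CONTEXT_* annotations above: each context byte plus its info string selects an independent subkey from the master key. A minimal userspace sketch of how the info buffer for HKDF_CONTEXT_IV_INO_LBLK_64_KEY might be assembled; the 1-byte mode number followed by a 16-byte filesystem UUID matches the hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)] layout used later in setup_per_mode_enc_key(), but this helper itself is hypothetical.]

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: build info = mode_num || fs_uuid for the
     * IV_INO_LBLK_64/32 per-mode key derivation. */
    static size_t build_lblk64_hkdf_info(uint8_t mode_num,
                                         const uint8_t fs_uuid[16],
                                         uint8_t info[1 + 16])
    {
            size_t infolen = 0;

            info[infolen++] = mode_num;          /* info = mode_num ... */
            memcpy(&info[infolen], fs_uuid, 16); /* ... || fs_uuid      */
            infolen += 16;
            return infolen;
    }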
*/ - struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; - struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; - struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; @@ -461,13 +557,15 @@ struct fscrypt_mode { int keysize; int ivsize; int logged_impl_name; + enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; -struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, - const u8 *raw_key, - const struct inode *inode); +int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, const struct fscrypt_info *ci); + +void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c new file mode 100644 index 000000000000..b6b8574caa13 --- /dev/null +++ b/fs/crypto/inline_crypt.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Inline encryption support for fscrypt + * + * Copyright 2019 Google LLC + */ + +/* + * With "inline encryption", the block layer handles the decryption/encryption + * as part of the bio, instead of the filesystem doing the crypto itself via + * crypto API. See Documentation/block/inline-encryption.rst. fscrypt still + * provides the key and IV to use. + */ + +#include <linux/blk-crypto.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/sched/mm.h> + +#include "fscrypt_private.h" + +struct fscrypt_blk_crypto_key { + struct blk_crypto_key base; + int num_devs; + struct request_queue *devs[]; +}; + +static int fscrypt_get_num_devices(struct super_block *sb) +{ + if (sb->s_cop->get_num_devices) + return sb->s_cop->get_num_devices(sb); + return 1; +} + +static void fscrypt_get_devices(struct super_block *sb, int num_devs, + struct request_queue **devs) +{ + if (num_devs == 1) + devs[0] = bdev_get_queue(sb->s_bdev); + else + sb->s_cop->get_devices(sb, devs); +} + +static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) +{ + struct super_block *sb = ci->ci_inode->i_sb; + unsigned int flags = fscrypt_policy_flags(&ci->ci_policy); + int ino_bits = 64, lblk_bits = 64; + + if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) + return offsetofend(union fscrypt_iv, nonce); + + if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) + return sizeof(__le64); + + if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + return sizeof(__le32); + + /* Default case: IVs are just the file logical block number */ + if (sb->s_cop->get_ino_and_lblk_bits) + sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); + return DIV_ROUND_UP(lblk_bits, 8); +} + +/* Enable inline encryption for this file if supported. 
*/ +int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +{ + const struct inode *inode = ci->ci_inode; + struct super_block *sb = inode->i_sb; + struct blk_crypto_config crypto_cfg; + int num_devs; + struct request_queue **devs; + int i; + + /* The file must need contents encryption, not filenames encryption */ + if (!fscrypt_needs_contents_encryption(inode)) + return 0; + + /* The crypto mode must have a blk-crypto counterpart */ + if (ci->ci_mode->blk_crypto_mode == BLK_ENCRYPTION_MODE_INVALID) + return 0; + + /* The filesystem must be mounted with -o inlinecrypt */ + if (!(sb->s_flags & SB_INLINECRYPT)) + return 0; + + /* + * When a page contains multiple logically contiguous filesystem blocks, + * some filesystem code only calls fscrypt_mergeable_bio() for the first + * block in the page. This is fine for most of fscrypt's IV generation + * strategies, where contiguous blocks imply contiguous IVs. But it + * doesn't work with IV_INO_LBLK_32. For now, simply exclude + * IV_INO_LBLK_32 with blocksize != PAGE_SIZE from inline encryption. + */ + if ((fscrypt_policy_flags(&ci->ci_policy) & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + sb->s_blocksize != PAGE_SIZE) + return 0; + + /* + * On all the filesystem's devices, blk-crypto must support the crypto + * configuration that the file would use. + */ + crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; + crypto_cfg.data_unit_size = sb->s_blocksize; + crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); + num_devs = fscrypt_get_num_devices(sb); + devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS); + if (!devs) + return -ENOMEM; + fscrypt_get_devices(sb, num_devs, devs); + + for (i = 0; i < num_devs; i++) { + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) + goto out_free_devs; + } + + ci->ci_inlinecrypt = true; +out_free_devs: + kfree(devs); + + return 0; +} + +int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, + const struct fscrypt_info *ci) +{ + const struct inode *inode = ci->ci_inode; + struct super_block *sb = inode->i_sb; + enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; + int num_devs = fscrypt_get_num_devices(sb); + int queue_refs = 0; + struct fscrypt_blk_crypto_key *blk_key; + int err; + int i; + unsigned int flags; + + blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS); + if (!blk_key) + return -ENOMEM; + + blk_key->num_devs = num_devs; + fscrypt_get_devices(sb, num_devs, blk_key->devs); + + err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode, + fscrypt_get_dun_bytes(ci), sb->s_blocksize); + if (err) { + fscrypt_err(inode, "error %d initializing blk-crypto key", err); + goto fail; + } + + /* + * We have to start using blk-crypto on all the filesystem's devices. + * We also have to save all the request_queue's for later so that the + * key can be evicted from them. This is needed because some keys + * aren't destroyed until after the filesystem was already unmounted + * (namely, the per-mode keys in struct fscrypt_master_key). + */ + for (i = 0; i < num_devs; i++) { + if (!blk_get_queue(blk_key->devs[i])) { + fscrypt_err(inode, "couldn't get request_queue"); + err = -EAGAIN; + goto fail; + } + queue_refs++; + + flags = memalloc_nofs_save(); + err = blk_crypto_start_using_key(&blk_key->base, + blk_key->devs[i]); + memalloc_nofs_restore(flags); + if (err) { + fscrypt_err(inode, + "error %d starting to use blk-crypto", err); + goto fail; + } + } + /* + * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). 
+ * I.e., here we publish ->blk_key with a RELEASE barrier so that + * concurrent tasks can ACQUIRE it. Note that this concurrency is only + * possible for per-mode keys, not for per-file keys. + */ + smp_store_release(&prep_key->blk_key, blk_key); + return 0; + +fail: + for (i = 0; i < queue_refs; i++) + blk_put_queue(blk_key->devs[i]); + kzfree(blk_key); + return err; +} + +void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +{ + struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key; + int i; + + if (blk_key) { + for (i = 0; i < blk_key->num_devs; i++) { + blk_crypto_evict_key(blk_key->devs[i], &blk_key->base); + blk_put_queue(blk_key->devs[i]); + } + kzfree(blk_key); + } +} + +bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) +{ + return inode->i_crypt_info->ci_inlinecrypt; +} +EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); + +static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, + u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) +{ + union fscrypt_iv iv; + int i; + + fscrypt_generate_iv(&iv, lblk_num, ci); + + BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE); + memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE); + for (i = 0; i < ci->ci_mode->ivsize/sizeof(dun[0]); i++) + dun[i] = le64_to_cpu(iv.dun[i]); +} + +/** + * fscrypt_set_bio_crypt_ctx() - prepare a file contents bio for inline crypto + * @bio: a bio which will eventually be submitted to the file + * @inode: the file's inode + * @first_lblk: the first file logical block number in the I/O + * @gfp_mask: memory allocation flags - these must be a waiting mask so that + * bio_crypt_set_ctx can't fail. + * + * If the contents of the file should be encrypted (or decrypted) with inline + * encryption, then assign the appropriate encryption context to the bio. + * + * Normally the bio should be newly allocated (i.e. no pages added yet), as + * otherwise fscrypt_mergeable_bio() won't work as intended. + * + * The encryption context will be freed automatically when the bio is freed. + */ +void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + u64 first_lblk, gfp_t gfp_mask) +{ + const struct fscrypt_info *ci; + u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + + if (!fscrypt_inode_uses_inline_crypto(inode)) + return; + ci = inode->i_crypt_info; + + fscrypt_generate_dun(ci, first_lblk, dun); + bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask); +} +EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); + +/* Extract the inode and logical block number from a buffer_head. */ +static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, + const struct inode **inode_ret, + u64 *lblk_num_ret) +{ + struct page *page = bh->b_page; + const struct address_space *mapping; + const struct inode *inode; + + /* + * The ext4 journal (jbd2) can submit a buffer_head it directly created + * for a non-pagecache page. fscrypt doesn't care about these. 
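[Editorial aside on fscrypt_generate_dun() above: the DUN handed to blk-crypto is simply the fscrypt IV reinterpreted as an array of little-endian 64-bit words. A minimal userspace sketch of that conversion, assuming the maximum 32-byte IV and using le64toh in place of le64_to_cpu:]

    #include <endian.h>
    #include <stdint.h>
    #include <string.h>

    #define IV_SIZE   32                           /* FSCRYPT_MAX_IV_SIZE */
    #define DUN_WORDS (IV_SIZE / sizeof(uint64_t))

    /* Reinterpret the little-endian IV bytes as host-endian DUN words. */
    static void iv_to_dun(const uint8_t iv[IV_SIZE], uint64_t dun[DUN_WORDS])
    {
            size_t i;

            for (i = 0; i < DUN_WORDS; i++) {
                    uint64_t le;

                    memcpy(&le, &iv[i * sizeof(le)], sizeof(le));
                    dun[i] = le64toh(le);
            }
    }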
+ */ + mapping = page_mapping(page); + if (!mapping) + return false; + inode = mapping->host; + + *inode_ret = inode; + *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) + + (bh_offset(bh) >> inode->i_blkbits); + return true; +} + +/** + * fscrypt_set_bio_crypt_ctx_bh() - prepare a file contents bio for inline + * crypto + * @bio: a bio which will eventually be submitted to the file + * @first_bh: the first buffer_head for which I/O will be submitted + * @gfp_mask: memory allocation flags + * + * Same as fscrypt_set_bio_crypt_ctx(), except this takes a buffer_head instead + * of an inode and block number directly. + */ +void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, + const struct buffer_head *first_bh, + gfp_t gfp_mask) +{ + const struct inode *inode; + u64 first_lblk; + + if (bh_get_inode_and_lblk_num(first_bh, &inode, &first_lblk)) + fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, gfp_mask); +} +EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh); + +/** + * fscrypt_mergeable_bio() - test whether data can be added to a bio + * @bio: the bio being built up + * @inode: the inode for the next part of the I/O + * @next_lblk: the next file logical block number in the I/O + * + * When building a bio which may contain data which should undergo inline + * encryption (or decryption) via fscrypt, filesystems should call this function + * to ensure that the resulting bio contains only contiguous data unit numbers. + * This will return false if the next part of the I/O cannot be merged with the + * bio because either the encryption key would be different or the encryption + * data unit numbers would be discontiguous. + * + * fscrypt_set_bio_crypt_ctx() must have already been called on the bio. + * + * Return: true iff the I/O is mergeable + */ +bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, + u64 next_lblk) +{ + const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + + if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) + return false; + if (!bc) + return true; + + /* + * Comparing the key pointers is good enough, as all I/O for each key + * uses the same pointer. I.e., there's currently no need to support + * merging requests where the keys are the same but the pointers differ. + */ + if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base) + return false; + + fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); +} +EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); + +/** + * fscrypt_mergeable_bio_bh() - test whether data can be added to a bio + * @bio: the bio being built up + * @next_bh: the next buffer_head for which I/O will be submitted + * + * Same as fscrypt_mergeable_bio(), except this takes a buffer_head instead of + * an inode and block number directly. 
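[Editorial aside on the logical block computation in bh_get_inode_and_lblk_num() above, with a worked example: for 4 KiB pages (PAGE_SHIFT = 12) and a 1 KiB block size (i_blkbits = 10), page index 3 covers blocks 12-15, and a buffer_head at byte offset 2048 within that page maps to logical block 12 + 2 = 14. A standalone sketch of the same arithmetic (hypothetical helper):]

    #include <stdint.h>

    #define PAGE_SHIFT 12   /* assuming 4 KiB pages */

    /* First block covered by the page, plus the bh's block index within it. */
    static uint64_t lblk_of_bh(uint64_t page_index, unsigned int blkbits,
                               unsigned int bh_offset_in_page)
    {
            return (page_index << (PAGE_SHIFT - blkbits)) +
                   (bh_offset_in_page >> blkbits);
    }

    /* Example: blkbits = 10, page 3, bh at byte 2048 -> logical block 14. */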
+ * + * Return: true iff the I/O is mergeable + */ +bool fscrypt_mergeable_bio_bh(struct bio *bio, + const struct buffer_head *next_bh) +{ + const struct inode *inode; + u64 next_lblk; + + if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk)) + return !bio->bi_crypt_context; + + return fscrypt_mergeable_bio(bio, inode, next_lblk); +} +EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index e24eb48bfbe1..71d56f8e2870 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -45,9 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_keys[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); + fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); + fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); + fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); @@ -213,7 +213,11 @@ static int allocate_filesystem_keyring(struct super_block *sb) if (IS_ERR(keyring)) return PTR_ERR(keyring); - /* Pairs with READ_ONCE() in fscrypt_find_master_key() */ + /* + * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). + * I.e., here we publish ->s_master_keys with a RELEASE barrier so that + * concurrent tasks can ACQUIRE it. + */ smp_store_release(&sb->s_master_keys, keyring); return 0; } @@ -234,8 +238,13 @@ struct key *fscrypt_find_master_key(struct super_block *sb, struct key *keyring; char description[FSCRYPT_MK_DESCRIPTION_SIZE]; - /* pairs with smp_store_release() in allocate_filesystem_keyring() */ - keyring = READ_ONCE(sb->s_master_keys); + /* + * Pairs with the smp_store_release() in allocate_filesystem_keyring(). + * I.e., another task can publish ->s_master_keys concurrently, + * executing a RELEASE barrier. We need to use smp_load_acquire() here + * to safely ACQUIRE the memory the other task published. + */ + keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. 
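[Editorial aside: the smp_store_release()/smp_load_acquire() pairing used for ->s_master_keys here, and likewise for ->tfm and ->blk_key in the prepared-key code, is the usual one-time-publish pattern: fully initialize the object, publish the pointer with release semantics, and read it back with acquire semantics so the initialization is guaranteed to be visible. A minimal userspace sketch of the same pattern with C11 atomics, illustrative only:]

    #include <stdatomic.h>
    #include <stdlib.h>

    struct keyring { int nkeys; };

    static _Atomic(struct keyring *) published_keyring;

    /* Publisher: fully initialize the object, then release-store the pointer. */
    static int publish_keyring(void)
    {
            struct keyring *kr = calloc(1, sizeof(*kr));

            if (!kr)
                    return -1;
            kr->nkeys = 0;  /* all initialization happens before the publish */
            atomic_store_explicit(&published_keyring, kr, memory_order_release);
            return 0;
    }

    /* Reader: the acquire-load pairs with the release-store above, so every
     * write made before the publish is visible once the pointer is seen. */
    static struct keyring *find_keyring(void)
    {
            return atomic_load_explicit(&published_keyring, memory_order_acquire);
    }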
*/ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 1129adfa097d..fea6226afc2b 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = { .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { .friendly_name = "AES-256-CTS-CBC", @@ -31,6 +32,7 @@ struct fscrypt_mode fscrypt_modes[] = { .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CTS-CBC", @@ -43,6 +45,7 @@ struct fscrypt_mode fscrypt_modes[] = { .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, }; @@ -64,9 +67,9 @@ select_encryption_mode(const union fscrypt_policy *policy, } /* Create a symmetric cipher object for the given encryption mode and key */ -struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, - const u8 *raw_key, - const struct inode *inode) +static struct crypto_skcipher * +fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, + const struct inode *inode) { struct crypto_skcipher *tfm; int err; @@ -109,30 +112,56 @@ err_free_tfm: return ERR_PTR(err); } -/* Given a per-file encryption key, set up the file's crypto transform object */ -int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) +/* + * Prepare the crypto transform object or blk-crypto key in @prep_key, given the + * raw key, encryption mode, and flag indicating which encryption implementation + * (fs-layer or blk-crypto) will be used. + */ +int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, const struct fscrypt_info *ci) { struct crypto_skcipher *tfm; + if (fscrypt_using_inline_encryption(ci)) + return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci); + tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); + /* + * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). + * I.e., here we publish ->tfm with a RELEASE barrier so that + * concurrent tasks can ACQUIRE it. Note that this concurrency is only + * possible for per-mode keys, not for per-file keys. + */ + smp_store_release(&prep_key->tfm, tfm); + return 0; +} - ci->ci_ctfm = tfm; +/* Destroy a crypto transform object and/or blk-crypto key. 
*/ +void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key) +{ + crypto_free_skcipher(prep_key->tfm); + fscrypt_destroy_inline_crypt_key(prep_key); +} + +/* Given a per-file encryption key, set up the file's crypto transform object */ +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) +{ ci->ci_owns_key = true; - return 0; + return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk, - struct crypto_skcipher **tfms, + struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm; + struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -141,16 +170,15 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with smp_store_release() below */ - tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) { - ci->ci_ctfm = tfm; + prep_key = &keys[mode_num]; + if (fscrypt_is_key_prepared(prep_key, ci)) { + ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); - if (tfms[mode_num]) + if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); @@ -167,16 +195,12 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, mode_key, mode->keysize); if (err) goto out_unlock; - tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); + err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) { - err = PTR_ERR(tfm); + if (err) goto out_unlock; - } - /* pairs with READ_ONCE() above */ - smp_store_release(&tfms[mode_num], tfm); done_unlock: - ci->ci_ctfm = tfm; + ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); @@ -189,7 +213,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, int err; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, (u8 *)&ci->ci_dirhash_key, sizeof(ci->ci_dirhash_key)); if (err) @@ -270,8 +294,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, - ci->ci_nonce, - FS_KEY_DERIVATION_NONCE_SIZE, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); if (err) return err; @@ -310,6 +333,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, struct fscrypt_key_specifier mk_spec; int err; + err = fscrypt_select_encryption_impl(ci); + if (err) + return err; + switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; @@ -402,7 +429,7 @@ static void put_crypt_info(struct fscrypt_info *ci) if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) - crypto_free_skcipher(ci->ci_ctfm); + fscrypt_destroy_prepared_key(&ci->ci_enc_key); key = ci->ci_master_key; if (key) { @@ -472,7 +499,7 @@ int fscrypt_get_encryption_info(struct inode *inode) } memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx), - FS_KEY_DERIVATION_NONCE_SIZE); + FSCRYPT_FILE_NONCE_SIZE); if 
(!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { res = -EINVAL; @@ -491,7 +518,17 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) goto out; + /* + * Multiple tasks may race to set ->i_crypt_info, so use + * cmpxchg_release(). This pairs with the smp_load_acquire() in + * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a + * RELEASE barrier so that other tasks can ACQUIRE it. + */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + /* + * We won the race and set ->i_crypt_info to our crypt_info. + * Now link it into the master key's inode list. + */ if (master_key) { struct fscrypt_master_key *mk = master_key->payload.data[0]; @@ -562,7 +599,7 @@ EXPORT_SYMBOL(fscrypt_free_inode); */ int fscrypt_drop_inode(struct inode *inode) { - const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info); + const struct fscrypt_info *ci = fscrypt_get_info(inode); const struct fscrypt_master_key *mk; /* diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 801b48c0cd7f..e4e707fb1100 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -45,7 +45,7 @@ static DEFINE_SPINLOCK(fscrypt_direct_keys_lock); * key is longer, then only the first 'derived_keysize' bytes are used. */ static int derive_key_aes(const u8 *master_key, - const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE], + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], u8 *derived_key, unsigned int derived_keysize) { int res = 0; @@ -68,7 +68,7 @@ static int derive_key_aes(const u8 *master_key, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE); if (res < 0) goto out; @@ -146,7 +146,7 @@ struct fscrypt_direct_key { struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; - struct crypto_skcipher *dk_ctfm; + struct fscrypt_prepared_key dk_key; u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 dk_raw[FSCRYPT_MAX_KEY_SIZE]; }; @@ -154,7 +154,7 @@ struct fscrypt_direct_key { static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { - crypto_free_skcipher(dk->dk_ctfm); + fscrypt_destroy_prepared_key(&dk->dk_key); kzfree(dk); } } @@ -199,6 +199,8 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, continue; if (ci->ci_mode != dk->dk_mode) continue; + if (!fscrypt_is_key_prepared(&dk->dk_key, ci)) + continue; if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) continue; /* using existing tfm with same (descriptor, mode, raw_key) */ @@ -231,13 +233,9 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) return ERR_PTR(-ENOMEM); refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; - dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, - ci->ci_inode); - if (IS_ERR(dk->dk_ctfm)) { - err = PTR_ERR(dk->dk_ctfm); - dk->dk_ctfm = NULL; + err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); + if (err) goto err_free_dk; - } memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); @@ -259,7 +257,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci, if (IS_ERR(dk)) return PTR_ERR(dk); ci->ci_direct_key = dk; - ci->ci_ctfm = dk->dk_ctfm; + ci->ci_enc_key = dk->dk_key; return 0; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index d23ff162c78b..2d73fd39ad96 100644 --- 
a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -78,6 +78,20 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, int ino_bits = 64, lblk_bits = 64; /* + * IV_INO_LBLK_* exist only because of hardware limitations, and + * currently the only known use case for them involves AES-256-XTS. + * That's also all we test currently. For these reasons, for now only + * allow AES-256-XTS here. This can be relaxed later if a use case for + * IV_INO_LBLK_* with other encryption modes arises. + */ + if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) { + fscrypt_warn(inode, + "Can't use %s policy with contents mode other than AES-256-XTS", + type); + return false; + } + + /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. */ @@ -338,7 +352,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) union fscrypt_context ctx; int ret; - ci = READ_ONCE(inode->i_crypt_info); + ci = fscrypt_get_info(inode); if (ci) { /* key available, use the cached policy */ *policy = ci->ci_policy; @@ -529,7 +543,7 @@ int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) if (!fscrypt_context_is_valid(&ctx, ret)) return -EINVAL; if (copy_to_user(arg, fscrypt_context_nonce(&ctx), - FS_KEY_DERIVATION_NONCE_SIZE)) + FSCRYPT_FILE_NONCE_SIZE)) return -EFAULT; return 0; } @@ -627,7 +641,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, if (res < 0) return res; - ci = READ_ONCE(parent->i_crypt_info); + ci = fscrypt_get_info(parent); if (ci == NULL) return -ENOKEY; diff --git a/fs/direct-io.c b/fs/direct-io.c index 6d5370eac2a8..183299892465 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1387,8 +1387,8 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * Attempt to prefetch the pieces we likely need later. */ prefetch(&bdev->bd_disk->part_tbl); - prefetch(bdev->bd_queue); - prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + prefetch(bdev->bd_disk->queue); + prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES); return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, end_io, submit_io, flags); diff --git a/fs/efs/super.c b/fs/efs/super.c index 4a6ebff2af76..a4a945d0ac6a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/blkdev.h> #include "efs.h" #include <linux/efs_vh.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 10dd470876b3..44bad4bb8831 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1096,7 +1096,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } if (unlikely(err)) { page_zero_new_buffers(page, from, to); - } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { + } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -3737,7 +3737,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; - if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. 
*/ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(page, blocksize, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index de6fe969f773..defd2e10dfd1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -402,6 +402,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; @@ -418,7 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io, { int ret; - if (io->io_bio && bh->b_blocknr != io->io_next_block) { + if (io->io_bio && (bh->b_blocknr != io->io_next_block || + !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: ext4_io_submit(io); } @@ -506,7 +508,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { + if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5761e9961682..f2df2db0786c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -195,7 +195,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio, { unsigned int post_read_steps = 0; - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) @@ -230,6 +230,7 @@ int ext4_mpage_readpages(struct inode *inode, const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; + sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -258,7 +259,8 @@ int ext4_mpage_readpages(struct inode *inode, if (page_has_buffers(page)) goto confused; - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + block_in_file = next_block = + (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; @@ -358,7 +360,8 @@ int ext4_mpage_readpages(struct inode *inode, * This page will go to BIO. Do we need to send this * BIO off first? 
*/ - if (bio && (last_block_in_bio != blocks[0] - 1)) { + if (bio && (last_block_in_bio != blocks[0] - 1 || + !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; @@ -370,6 +373,8 @@ int ext4_mpage_readpages(struct inode *inode, */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + fscrypt_set_bio_crypt_ctx(bio, inode, next_block, + GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 330957ed1f05..0907f907c47d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1508,6 +1508,7 @@ enum { Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, + Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, @@ -1610,6 +1611,7 @@ static const match_table_t tokens = { {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ @@ -1946,6 +1948,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_nolazytime: sb->s_flags &= ~SB_LAZYTIME; return 1; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; +#else + ext4_msg(sb, KERN_ERR, "inline encryption not supported"); +#endif + return 1; } for (m = ext4_mount_opts; m->token != Opt_err; m++) @@ -2404,6 +2413,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, fscrypt_show_test_dummy_encryption(seq, sep, sb); + if (sb->s_flags & SB_INLINECRYPT) + SEQ_OPTS_PUTS("inlinecrypt"); + if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1e02a8c106b0..29e50fbe7eca 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1086,7 +1086,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .submitted = false, .io_type = io_type, .io_wbc = wbc, - .encrypted = f2fs_encrypted_file(cc->inode), + .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode), }; struct dnode_of_data dn; struct node_info ni; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 326c63879ddc..b9642607c07d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -14,6 +14,7 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/blk-crypto.h> #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> @@ -459,6 +460,33 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) return bio; } +static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + pgoff_t first_idx, + const struct f2fs_io_info *fio, + gfp_t gfp_mask) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. 
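[Editorial aside: the ext4 and f2fs hunks in this area all follow the same calling convention: set the crypt context on a freshly allocated bio, and before adding another page to an existing bio, check fscrypt_mergeable_bio() in addition to the usual physical-contiguity check. A condensed sketch of that pattern; add_block_to_read_bio() is a hypothetical helper, not ext4/f2fs code, and error handling is omitted:]

    #include <linux/bio.h>
    #include <linux/fs.h>
    #include <linux/fscrypt.h>

    /* Physical contiguity is tracked in *last_pblk; fscrypt_mergeable_bio()
     * additionally rejects blocks whose key or DUN would not follow on from
     * what is already in the bio. */
    static struct bio *add_block_to_read_bio(struct bio *bio, struct inode *inode,
                                             struct page *page, sector_t lblk,
                                             sector_t pblk, sector_t *last_pblk)
    {
            if (bio && (*last_pblk + 1 != pblk ||
                        !fscrypt_mergeable_bio(bio, inode, lblk))) {
                    submit_bio(bio);
                    bio = NULL;
            }
            if (!bio) {
                    bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
                    fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
                    bio_set_dev(bio, inode->i_sb->s_bdev);
                    bio->bi_iter.bi_sector = pblk << (inode->i_blkbits - 9);
                    bio_set_op_attrs(bio, REQ_OP_READ, 0);
            }
            bio_add_page(bio, page, i_blocksize(inode), 0);
            *last_pblk = pblk;
            return bio;
    }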
+ */ + if (!fio || !fio->encrypted_page) + fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); +} + +static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, + pgoff_t next_idx, + const struct f2fs_io_info *fio) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. + */ + if (fio && fio->encrypted_page) + return !bio_has_crypt_ctx(bio); + + return fscrypt_mergeable_bio(bio, inode, next_idx); +} + static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { @@ -684,6 +712,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio, 1); + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; @@ -763,9 +794,10 @@ static void del_bio_entry(struct bio_entry *be) kmem_cache_free(bio_entry_slab, be); } -static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, +static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { + struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; int ret = -EAGAIN; @@ -782,13 +814,19 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, found = true; - if (bio_add_page(*bio, page, PAGE_SIZE, 0) == - PAGE_SIZE) { + f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, + *fio->last_block, + fio->new_blkaddr)); + if (f2fs_crypt_mergeable_bio(*bio, + fio->page->mapping->host, + fio->page->index, fio) && + bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { ret = 0; break; } - /* bio is full */ + /* page can't be merged into bio; submit the bio */ del_bio_entry(be); __submit_bio(sbi, *bio, DATA); break; @@ -880,11 +918,13 @@ alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_PAGES); __attach_io_flag(fio); + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { - if (add_ipu_page(fio->sbi, &bio, page)) + if (add_ipu_page(fio, &bio, page)) goto alloc_new; } @@ -936,8 +976,11 @@ next: inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio, - io->last_block_in_bio, fio->new_blkaddr)) + if (io->bio && + (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, + fio->new_blkaddr) || + !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, + bio_page->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -949,6 +992,8 @@ alloc_new: goto skip; } io->bio = __bio_alloc(fio, BIO_MAX_PAGES); + f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, + bio_page->index, fio, GFP_NOIO); io->fio = *fio; } @@ -993,11 +1038,14 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, for_write); if (!bio) return ERR_PTR(-ENOMEM); + + f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio_set_op_attrs(bio, REQ_OP_READ, op_flag); - if (f2fs_encrypted_file(inode)) + if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (f2fs_compressed_file(inode)) post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; @@ -2073,8 +2121,9 @@ zero_out: * This page will go to BIO. Do we need to send this * BIO off first? 
*/ - if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio, - *last_block_in_bio, block_nr)) { + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + *last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; @@ -2204,8 +2253,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); - if (bio && !page_is_mergeable(sbi, bio, - *last_block_in_bio, blkaddr)) { + if (bio && (!page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: __submit_bio(sbi, bio, DATA); bio = NULL; @@ -2421,6 +2471,9 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + if (fscrypt_inode_uses_inline_crypto(inode)) + return 0; + retry_encrypt: fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, gfp_flags); @@ -2594,7 +2647,7 @@ got_it: f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); if (err) { - if (f2fs_encrypted_file(inode)) + if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); if (PageWriteback(page)) end_page_writeback(page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 20e56b0fa46a..23c49c313fb6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -138,6 +138,7 @@ enum { Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, + Opt_inlinecrypt, Opt_checkpoint_disable, Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, @@ -204,6 +205,7 @@ static match_table_t f2fs_tokens = { {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_inlinecrypt, "inlinecrypt"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, @@ -833,6 +835,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (ret) return ret; break; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; +#else + f2fs_info(sbi, "inline encryption not supported"); +#endif + break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -1590,6 +1599,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); + if (sbi->sb->s_flags & SB_INLINECRYPT) + seq_puts(seq, ",inlinecrypt"); + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) @@ -1624,6 +1636,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_ext_cnt = 0; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + sbi->sb->s_flags &= ~SB_INLINECRYPT; + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); @@ -2470,6 +2484,25 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } +static int f2fs_get_num_devices(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (f2fs_is_multi_device(sbi)) + return sbi->s_ndevs; + return 1; +} + +static void f2fs_get_devices(struct super_block *sb, + struct 
request_queue **devs) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + devs[i] = bdev_get_queue(FDEV(i).bdev); +} + static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, @@ -2479,6 +2512,8 @@ static const struct fscrypt_operations f2fs_cryptops = { .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, + .get_num_devices = f2fs_get_num_devices, + .get_devices = f2fs_get_devices, }; #endif diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 2f224b98ee94..f35a37c65e5f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -17,6 +17,7 @@ #include <linux/cred.h> #include <linux/uio.h> #include <linux/xattr.h> +#include <linux/blkdev.h> #include "hfs_fs.h" #include "btree.h" diff --git a/fs/internal.h b/fs/internal.h index 9b863a7bd708..969988d3d397 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,7 +23,9 @@ struct user_namespace; extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); - +void iterate_bdevs(void (*)(struct block_device *, void *), void *); +void emergency_thaw_bdev(struct super_block *sb); +void bd_forget(struct inode *inode); #else static inline void bdev_cache_init(void) { @@ -33,7 +35,18 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) { return 0; } -#endif +static inline void iterate_bdevs(void (*f)(struct block_device *, void *), + void *arg) +{ +} +static inline int emergency_thaw_bdev(struct super_block *sb) +{ + return 0; +} +static inline void bd_forget(struct inode *inode) +{ +} +#endif /* CONFIG_BLOCK */ /* * buffer.c diff --git a/fs/io-wq.c b/fs/io-wq.c index 47c5f3aeb460..e92c4724480c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker, io_wq_switch_mm(worker, work); if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; } static void io_assign_current_work(struct io_worker *worker, @@ -489,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; - unsigned int hash; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -512,6 +512,7 @@ get_next: /* handle a whole dependent link */ do { struct io_wq_work *old_work, *next_hashed, *linked; + unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); io_impersonate_work(worker, work); @@ -522,10 +523,8 @@ get_next: if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; - hash = io_get_work_hash(work); - linked = old_work = work; - wq->do_work(&linked); - linked = (old_work == linked) ? NULL : linked; + old_work = work; + linked = wq->do_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { @@ -542,8 +541,6 @@ get_next: spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; - /* dependent work is not hashed */ - hash = -1U; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; @@ -781,8 +778,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(&work); - work = (work == old_work) ? 
NULL : work; + work = wq->do_work(work); wq->free_work(old_work); } while (work); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 071f1a997800..ddaf9614cf9b 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,10 +5,10 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_NO_CANCEL = 256, - IO_WQ_WORK_CONCURRENT = 512, + IO_WQ_WORK_HASHED = 2, + IO_WQ_WORK_UNBOUND = 4, + IO_WQ_WORK_NO_CANCEL = 8, + IO_WQ_WORK_CONCURRENT = 16, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -89,6 +89,7 @@ struct io_wq_work { struct mm_struct *mm; const struct cred *creds; struct fs_struct *fs; + unsigned long fsize; unsigned flags; }; @@ -101,7 +102,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) } typedef void (free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work **); +typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *); struct io_wq_data { struct user_struct *user; diff --git a/fs/io_uring.c b/fs/io_uring.c index 32b0064f806e..2a3af95be4ca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include <linux/fs_struct.h> #include <linux/splice.h> #include <linux/task_work.h> +#include <linux/pagemap.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -226,7 +227,7 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int account_mem: 1; + unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; @@ -319,12 +320,12 @@ struct io_ring_ctx { spinlock_t completion_lock; /* - * ->poll_list is protected by the ctx->uring_lock for + * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. 
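The io-wq hunks above change the work-callback contract: io_wq_work_fn no longer takes a struct io_wq_work ** and rewrites it in place, it now returns the next (linked) work item, or NULL when the chain is done. A minimal sketch (not taken from the patch) of the caller-side loop this implies, mirroring the io_run_cancel() hunk above; the function name drive_work_chain is hypothetical:

static void drive_work_chain(struct io_wq *wq, struct io_wq_work *work)
{
	do {
		struct io_wq_work *old_work = work;

		/* the handler returns the dependent link, or NULL when done */
		work = wq->do_work(work);
		wq->free_work(old_work);
	} while (work);
}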
*/ - struct list_head poll_list; + struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; @@ -395,6 +396,7 @@ struct io_timeout { int flags; u32 off; u32 target_seq; + struct list_head list; }; struct io_rw { @@ -413,7 +415,7 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *msg; + struct user_msghdr __user *umsg; void __user *buf; }; int msg_flags; @@ -486,6 +488,12 @@ struct io_statx { struct statx __user *buffer; }; +struct io_completion { + struct file *file; + struct list_head list; + int cflags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -503,6 +511,7 @@ struct io_async_rw { struct iovec *iov; ssize_t nr_segs; ssize_t size; + struct wait_page_queue wpq; }; struct io_async_ctx { @@ -523,23 +532,18 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_HEAD_BIT, - REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_MUST_PUNT_BIT, - REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, - REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, REQ_F_TASK_PINNED_BIT, @@ -563,8 +567,6 @@ enum { /* head of a link */ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), - /* already grabbed next link */ - REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -575,14 +577,8 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* timeout request */ - REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* must be punted even for NONBLOCK */ - REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), - /* no timeout sequence */ - REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ @@ -595,8 +591,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), - /* needs to queue linked timeout */ - REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* req->task is refcounted */ @@ -606,7 +600,6 @@ enum { struct async_poll { struct io_poll_iocb poll; struct io_poll_iocb *double_poll; - struct io_wq_work work; }; /* @@ -635,51 +628,54 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + /* use only after cleaning per-op data, see io_clean_op() */ + struct io_completion compl; }; struct io_async_ctx *io; - int cflags; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; u16 buf_index; + u32 result; - struct io_ring_ctx *ctx; - struct list_head list; - unsigned int flags; - refcount_t refs; - struct task_struct *task; - unsigned long fsize; - u64 user_data; - u32 result; - u32 sequence; - - struct list_head link_list; + struct io_ring_ctx *ctx; + unsigned int flags; + refcount_t refs; + struct task_struct *task; + u64 user_data; - struct list_head inflight_entry; + struct list_head link_list; - struct percpu_ref *fixed_file_refs; + /* + * 1. used with ctx->iopoll_list with reads/writes + * 2. 
to track reqs with ->files (see io_op_def::file_table) + */ + struct list_head inflight_entry; + + struct percpu_ref *fixed_file_refs; + struct callback_head task_work; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + struct async_poll *apoll; + struct io_wq_work work; +}; - union { - /* - * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them, and - * async armed poll handlers for regular commands. The latter - * restore the work, if needed. - */ - struct { - struct callback_head task_work; - struct hlist_node hash_node; - struct async_poll *apoll; - }; - struct io_wq_work work; - }; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; + u32 seq; }; -#define IO_PLUG_THRESHOLD 2 #define IO_IOPOLL_BATCH 8 +struct io_comp_state { + unsigned int nr; + struct list_head list; + struct io_ring_ctx *ctx; +}; + struct io_submit_state { struct blk_plug plug; @@ -690,12 +686,16 @@ struct io_submit_state { unsigned int free_reqs; /* + * Batch completion logic + */ + struct io_comp_state comp; + + /* * File reference cache */ struct file *file; unsigned int fd; unsigned int has_refs; - unsigned int used_refs; unsigned int ios_left; }; @@ -723,6 +723,7 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + unsigned needs_fsize : 1; }; static const struct io_op_def io_op_defs[] = { @@ -742,6 +743,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -756,6 +758,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -808,6 +811,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .needs_fsize = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, @@ -839,6 +843,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -881,22 +886,37 @@ static const struct io_op_def io_op_defs[] = { }, }; -static void io_wq_submit_work(struct io_wq_work **workptr); +enum io_mem_account { + ACCT_LOCKED, + ACCT_PINNED, +}; + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_grab_files(struct io_kiocb *req); -static void io_complete_rw_common(struct kiocb *kiocb, long res); -static void io_cleanup_req(struct io_kiocb *req); +static int io_prep_work_files(struct io_kiocb *req); +static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe); + const struct io_uring_sqe *sqe, + struct io_comp_state *cs); +static void 
io_file_put_work(struct work_struct *work); + +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); static struct kmem_cache *req_cachep; @@ -923,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req) req->flags |= REQ_F_TASK_PINNED; } +static inline void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + __io_clean_op(req); +} + /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ static void __io_put_req_task(struct io_kiocb *req) { @@ -930,7 +956,41 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } -static void io_file_put_work(struct work_struct *work); +static void io_sq_thread_drop_mm(void) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) +{ + if (!current->mm) { + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} /* * Note: must call io_req_init_async() for the first time you @@ -957,6 +1017,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + return !req->timeout.off; +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1000,7 +1065,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -1017,18 +1082,14 @@ err: return NULL; } -static inline bool __req_need_defer(struct io_kiocb *req) +static bool req_need_defer(struct io_kiocb *req, u32 seq) { - struct io_ring_ctx *ctx = req->ctx; + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + struct io_ring_ctx *ctx = req->ctx; - return req->sequence != ctx->cached_cq_tail + return seq != ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow); -} - -static inline bool req_need_defer(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) - return __req_need_defer(req); + } return false; } @@ -1046,28 +1107,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req, - const struct io_op_def *def) -{ - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(&current->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(&current->fs->lock); - } -} - -static inline void 
io_req_work_drop_env(struct io_kiocb *req) +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; @@ -1089,11 +1129,12 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); + req->work.fs = NULL; } + req->flags &= ~REQ_F_WORK_INITIALIZED; } -static inline void io_prep_async_work(struct io_kiocb *req, - struct io_kiocb **link) +static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1106,18 +1147,42 @@ static inline void io_prep_async_work(struct io_kiocb *req, if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(&current->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(&current->fs->lock); + } + if (def->needs_fsize) + req->work.fsize = rlimit(RLIMIT_FSIZE); + else + req->work.fsize = RLIM_INFINITY; +} - io_req_work_grab_env(req, def); +static void io_prep_async_link(struct io_kiocb *req) +{ + struct io_kiocb *cur; - *link = io_prep_linked_timeout(req); + io_prep_async_work(req); + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(cur, &req->link_list, link_list) + io_prep_async_work(cur); } -static inline void io_queue_async_work(struct io_kiocb *req) +static void __io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; - - io_prep_async_work(req, &link); + struct io_kiocb *link = io_prep_linked_timeout(req); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); @@ -1127,14 +1192,22 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_queue_async_work(struct io_kiocb *req) +{ + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + __io_queue_async_work(req); +} + static void io_kill_timeout(struct io_kiocb *req) { int ret; ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { - atomic_inc(&req->ctx->cq_timeouts); - list_del_init(&req->list); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); @@ -1146,7 +1219,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) io_kill_timeout(req); spin_unlock_irq(&ctx->completion_lock); } @@ -1154,13 +1227,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { - struct io_kiocb *req = list_first_entry(&ctx->defer_list, - struct io_kiocb, list); + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); - if (req_need_defer(req)) + if (req_need_defer(de->req, de->seq)) break; - list_del_init(&req->list); - io_queue_async_work(req); + list_del_init(&de->list); + /* punt-init is done before queueing for defer */ + __io_queue_async_work(de->req); + kfree(de); } while (!list_empty(&ctx->defer_list));
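One practical effect of folding io_req_work_grab_env() into io_prep_async_work() is the new needs_fsize handling: opcodes that can extend a file snapshot the submitting task's RLIMIT_FSIZE into work->fsize at prep time, and the io-wq worker installs that value via io_impersonate_work() (see the fs/io-wq.c hunk above) before running the request, replacing the open-coded rlimit twiddling in the write and fallocate paths. A condensed restatement of the two halves, for orientation only:

	/* submission side -- from io_prep_async_work() above */
	if (def->needs_fsize)
		req->work.fsize = rlimit(RLIMIT_FSIZE);	/* capture the submitter's limit */
	else
		req->work.fsize = RLIM_INFINITY;

	/* worker side -- from io_impersonate_work() in fs/io-wq.c above */
	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;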
} @@ -1168,15 +1243,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->timeout_list)) { struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, list); + struct io_kiocb, timeout.list); - if (req->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(req)) break; if (req->timeout.target_seq != ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts)) break; - list_del_init(&req->list); + list_del_init(&req->timeout.list); io_kill_timeout(req); } } @@ -1229,6 +1304,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) +{ + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1259,13 +1343,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - list); - list_move(&req->list, &list); + compl.list); + list_move(&req->compl.list, &list); req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->cflags); + WRITE_ONCE(cqe->flags, req->compl.cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1273,17 +1357,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); - if (cqe) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + io_cqring_mark_overflow(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&list, struct io_kiocb, compl.list); + list_del(&req->compl.list); io_put_req(req); } @@ -1316,11 +1397,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } + io_clean_op(req); req->flags |= REQ_F_OVERFLOW; - refcount_inc(&req->refs); req->result = res; - req->cflags = cflags; - list_add_tail(&req->list, &ctx->cq_overflow_list); + req->compl.cflags = cflags; + refcount_inc(&req->refs); + list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } @@ -1329,7 +1411,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) +static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -1342,9 +1424,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_submit_flush_completions(struct io_comp_state *cs) +{ + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, compl.list); + list_del(&req->compl.list); + __io_cqring_fill_event(req, 
req->result, req->compl.cflags); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, + struct io_comp_state *cs) { - __io_cqring_add_event(req, res, 0); + if (!cs) { + io_cqring_add_event(req, res, cflags); + io_put_req(req); + } else { + io_clean_op(req); + req->result = res; + req->compl.cflags = cflags; + list_add_tail(&req->compl.list, &cs->list); + if (++cs->nr >= 32) + io_submit_flush_completions(cs); + } +} + +static void io_req_complete(struct io_kiocb *req, long res) +{ + __io_req_complete(req, res, 0, NULL); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -1370,11 +1495,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!state) { - req = kmem_cache_alloc(req_cachep, gfp); - if (unlikely(!req)) - goto fallback; - } else if (!state->free_reqs) { + if (!state->free_reqs) { size_t sz; int ret; @@ -1412,21 +1533,15 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void __io_req_aux_free(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); + io_clean_op(req); - kfree(req->io); + if (req->io) + kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - __io_put_req_task(req); - io_req_work_drop_env(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - __io_req_aux_free(req); + io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1438,57 +1553,20 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - - percpu_ref_put(&req->ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; - int need_iter; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +static void __io_free_req(struct io_kiocb *req) { - if (!rb->to_free) - return; - if (rb->need_iter) { - int i, inflight = 0; - unsigned long flags; - - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) - inflight++; - __io_req_aux_free(req); - } - if (!inflight) - goto do_free; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) { - list_del(&req->inflight_entry); - if (!--inflight) - break; - } - } - spin_unlock_irqrestore(&ctx->inflight_lock, flags); + struct io_ring_ctx *ctx; - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - } -do_free: - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = rb->need_iter = 0; + io_dismantle_req(req); + __io_put_req_task(req); + ctx = req->ctx; + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + 
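The io_comp_state introduced above (embedded in io_submit_state, see the struct earlier in these hunks) batches CQE posting for requests that complete inline during submission: __io_req_complete() stages them on cs->list, and io_submit_flush_completions() fills every staged CQE under a single completion_lock acquisition. A condensed, illustrative restatement of the staging rule (not literal patch code):

	/* from __io_req_complete(): stage, then flush in bulk */
	req->result = res;
	req->compl.cflags = cflags;
	list_add_tail(&req->compl.list, &cs->list);
	if (++cs->nr >= 32)			/* flush threshold used by the patch */
		io_submit_flush_completions(cs);	/* one lock round-trip for all staged CQEs */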
percpu_ref_put(&ctx->refs); } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1508,53 +1586,67 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static bool __io_kill_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link; + bool wake_ev; + + if (list_empty(&req->link_list)) + return false; + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + if (link->opcode != IORING_OP_LINK_TIMEOUT) + return false; + + list_del_init(&link->link_list); + wake_ev = io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; + return wake_ev; +} + +static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool wake_ev = false; + bool wake_ev; - /* Already got next link */ - if (req->flags & REQ_F_LINK_NEXT) - return; + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + + if (wake_ev) + io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); - - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; - break; - } + if (unlikely(list_empty(&req->link_list))) + return NULL; - req->flags |= REQ_F_LINK_NEXT; - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + return nxt; } /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1563,25 +1655,37 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_fail_links(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return; + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + 
spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + + io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); /* * If LINK is set, we have dependent requests in this chain. If we @@ -1589,62 +1693,187 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { - io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; +} - /* - * If this is a timeout link, we could be racing with the - * timeout timer. Grab the completion lock for this case to - * protect against that. - */ - spin_lock_irqsave(&ctx->completion_lock, flags); - io_req_link_next(req, nxt); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_LINK_HEAD))) + return NULL; + return __io_req_find_next(req); +} + +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. 
+ */ + if (ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + +static void __io_req_task_cancel(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_req_task_cancel(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_cancel(req, -ECANCELED); +} + +static void __io_req_task_submit(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!__io_sq_thread_acquire_mm(ctx)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL, NULL); + mutex_unlock(&ctx->uring_lock); } else { - io_req_link_next(req, nxt); + __io_req_task_cancel(req, -EFAULT); } } -static void io_free_req(struct io_kiocb *req) +static void io_req_task_submit(struct callback_head *cb) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - io_req_find_next(req, &nxt); - __io_free_req(req); + __io_req_task_submit(req); +} + +static void io_req_task_queue(struct io_kiocb *req) +{ + int ret; + + init_task_work(&req->task_work, io_req_task_submit); + + ret = io_req_task_work_add(req, &req->task_work); + if (unlikely(ret)) { + struct task_struct *tsk; + + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } +} + +static void io_queue_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt = io_req_find_next(req); if (nxt) - io_queue_async_work(nxt); + io_req_task_queue(nxt); } -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +static void io_free_req(struct io_kiocb *req) { - struct io_kiocb *link; - const struct io_op_def *def = &io_op_defs[nxt->opcode]; + io_queue_next(req); + __io_free_req(req); +} - if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; - *workptr = &nxt->work; - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; + struct task_struct *task; + int task_refs; +}; + +static inline void io_init_req_batch(struct req_batch *rb) +{ + rb->to_free = 0; + rb->task_refs = 0; + rb->task = NULL; +} + +static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = 0; +} + +static void io_req_free_batch_finish(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + if (rb->to_free) + __io_req_free_batch_flush(ctx, rb); + if (rb->task) { + put_task_struct_many(rb->task, rb->task_refs); + rb->task = NULL; + } +} + +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +{ + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); + + if (req->flags & REQ_F_TASK_PINNED) { + if (req->task != rb->task) { + if (rb->task) + put_task_struct_many(rb->task, rb->task_refs); + rb->task = req->task; + rb->task_refs = 0; + } + rb->task_refs++; + req->flags &= 
~REQ_F_TASK_PINNED; + } + + io_dismantle_req(req); + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + __io_req_free_batch_flush(req->ctx, rb); } /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -__attribute__((nonnull)) -static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) { - io_req_find_next(req, nxtptr); + nxt = io_req_find_next(req); __io_free_req(req); } + return nxt; } static void io_put_req(struct io_kiocb *req) @@ -1653,24 +1882,20 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_steal_work(struct io_kiocb *req, - struct io_wq_work **workptr) +static struct io_wq_work *io_steal_work(struct io_kiocb *req) { + struct io_kiocb *nxt; + /* - * It's in an io-wq worker, so there always should be at least - * one reference, which will be dropped in io_put_work() just - * after the current handler returns. - * - * It also means, that if the counter dropped to 1, then there is - * no asynchronous users left, so it's safe to steal the next work. + * A ref is owned by io-wq in which context we're. So, if that's the + * last one, it's safe to steal next work. False negatives are Ok, + * it just will be re-punted async in io_put_work() */ - if (refcount_read(&req->refs) == 1) { - struct io_kiocb *nxt = NULL; + if (refcount_read(&req->refs) != 1) + return NULL; - io_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); - } + nxt = io_req_find_next(req); + return nxt ? &nxt->work : NULL; } /* @@ -1720,31 +1945,34 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) { - if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) - return false; + unsigned int cflags; - if (req->file || req->io) - rb->need_iter++; - - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); - return true; + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(kbuf); + return cflags; } -static int io_put_kbuf(struct io_kiocb *req) +static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; - int cflags; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; - req->rw.addr = 0; - kfree(kbuf); - return cflags; + return io_put_kbuf(req, kbuf); +} + +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; } static void io_iopoll_queue(struct list_head *again) @@ -1752,18 +1980,9 @@ static void io_iopoll_queue(struct list_head *again) struct io_kiocb *req; do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); - - /* shouldn't happen unless io_uring is dying, cancel reqs */ - if (unlikely(!current->mm)) { - io_complete_rw_common(&req->rw.kiocb, -EAGAIN); - io_put_req(req); - continue; - } - - refcount_inc(&req->refs); - io_queue_async_work(req); + req = 
list_first_entry(again, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); + __io_complete_rw(req, -EAGAIN, 0, NULL); } while (!list_empty(again)); } @@ -1780,33 +1999,32 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = rb.need_iter = 0; + io_init_req_batch(&rb); while (!list_empty(done)) { int cflags = 0; - req = list_first_entry(done, struct io_kiocb, list); + req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { req->iopoll_completed = 0; - list_move_tail(&req->list, &again); + list_move_tail(&req->inflight_entry, &again); continue; } - list_del(&req->list); + list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) - io_free_req(req); + if (refcount_dec_and_test(&req->refs)) + io_req_free_batch(&rb, req); } io_commit_cqring(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); + io_req_free_batch_finish(ctx, &rb); if (!list_empty(&again)) io_iopoll_queue(&again); @@ -1827,7 +2045,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -1836,7 +2054,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * and complete those lists first, if we have entries there. */ if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); continue; } if (!list_empty(&done)) @@ -1846,6 +2064,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ret < 0) break; + /* iopoll may have completed current req */ + if (READ_ONCE(req->iopoll_completed)) + list_move_tail(&req->inflight_entry, &done); + if (ret && spin) spin = false; ret = 0; @@ -1865,13 +2087,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list) && !need_resched()) { + while (!list_empty(&ctx->iopoll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); if (ret < 0) return ret; - if (!min || *nr_events >= min) + if (*nr_events >= min) return 0; } @@ -1882,29 +2104,37 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_iopoll_getevents(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 0); + /* let it sleep and repeat later if can't complete a request */ + if (nr_events == 0) + break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. 
+ * Also let task_work, etc. to progress by releasing the mutex */ - cond_resched(); + if (need_resched()) { + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } } mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { + unsigned int nr_events = 0; int iters = 0, ret = 0; /* @@ -1914,8 +2144,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ mutex_lock(&ctx->uring_lock); do { - int tmin = 0; - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -1936,17 +2164,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } - if (*nr_events < min) - tmin = min - *nr_events; - - ret = io_iopoll_getevents(ctx, nr_events, tmin); + ret = io_iopoll_getevents(ctx, &nr_events, min); if (ret <= 0) break; ret = 0; - } while (min && !*nr_events && !need_resched()); + } while (min && !nr_events && !need_resched()); mutex_unlock(&ctx->uring_lock); return ret; @@ -1966,13 +2192,8 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static inline void req_set_fail_links(struct io_kiocb *req) -{ - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; -} - -static void io_complete_rw_common(struct kiocb *kiocb, long res) +static void io_complete_rw_common(struct kiocb *kiocb, long res, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); int cflags = 0; @@ -1983,16 +2204,96 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if (res != req->result) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); - __io_cqring_add_event(req, res, cflags); + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, res, cflags, cs); +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); +end_req: + req_set_fail_links(req); + io_req_complete(req, ret); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + err = io_sq_thread_acquire_mm(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + init_task_work(&req->task_work, 
io_rw_resubmit); + ret = io_req_task_work_add(req, &req->task_work); + if (!ret) + return true; +#endif + return false; +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs) +{ + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - io_complete_rw_common(kiocb, res); - io_put_req(req); + __io_complete_rw(req, res, res2, NULL); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2026,13 +2327,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->poll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_file = false; } else if (!ctx->poll_multi_file) { struct io_kiocb *list_req; - list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, - list); + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, + inflight_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -2042,9 +2343,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->poll_list); + list_add(&req->inflight_entry, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->poll_list); + list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) @@ -2053,10 +2354,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) static void __io_state_file_put(struct io_submit_state *state) { - int diff = state->has_refs - state->used_refs; - - if (diff) - fput_many(state->file, diff); + if (state->has_refs) + fput_many(state->file, state->has_refs); state->file = NULL; } @@ -2078,7 +2377,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { - state->used_refs++; + state->has_refs--; state->ios_left--; return state->file; } @@ -2089,12 +2388,20 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->has_refs = state->ios_left; - state->used_refs = 1; state->ios_left--; + state->has_refs = state->ios_left; return state->file; } +static bool io_bdev_nowait(struct block_device *bdev) +{ +#ifdef CONFIG_BLOCK + return !bdev || queue_is_mq(bdev_get_queue(bdev)); +#else + return true; +#endif +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. 
For now, just ensure that anything potentially problematic is done @@ -2104,10 +2411,19 @@ static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) - return true; - if (S_ISREG(mode) && file->f_op != &io_uring_fops) + if (S_ISBLK(mode)) { + if (io_bdev_nowait(file->f_inode->i_bdev)) + return true; + return false; + } + if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; + if (S_ISREG(mode)) { + if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + file->f_op != &io_uring_fops) + return true; + return false; + } /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) @@ -2158,6 +2474,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -2168,8 +2487,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; - req->result = 0; req->iopoll_completed = 0; + io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -2203,14 +2522,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) - io_complete_rw(kiocb, ret, 0); + __io_complete_rw(req, ret, 0, cs); else io_rw_done(kiocb, ret); } @@ -2466,10 +2786,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->io) { struct io_async_rw *iorw = &req->io->rw; - *iovec = iorw->iov; - iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); - if (iorw->iov == iorw->fast_iov) - *iovec = NULL; + iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); + *iovec = NULL; return iorw->size; } @@ -2554,15 +2872,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io->rw.nr_segs = iter->nr_segs; - req->io->rw.size = io_size; - req->io->rw.iov = iovec; - if (!req->io->rw.iov) { - req->io->rw.iov = req->io->rw.fast_iov; - if (req->io->rw.iov != fast_iov) - memcpy(req->io->rw.iov, fast_iov, + struct io_async_rw *rw = &req->io->rw; + + rw->nr_segs = iter->nr_segs; + rw->size = io_size; + if (!iovec) { + rw->iov = rw->fast_iov; + if (rw->iov != fast_iov) + memcpy(rw->iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); } else { + rw->iov = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2596,11 +2916,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } +static inline int io_rw_prep_async(struct io_kiocb *req, int rw, + bool force_nonblock) +{ + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret; + + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); + req->io = io; + if (unlikely(ret < 0)) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct 
iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2613,75 +2949,169 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; + return io_rw_prep_async(req, READ, force_nonblock); +} - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct wait_page_key *key = arg; + int ret; - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + wpq = container_of(wait, struct wait_page_queue, wait); + + if (!wake_page_match(wpq, key)) + return 0; + + /* Stop waking things up if the page is locked again */ + if (test_bit(key->bit_nr, &key->page->flags)) + return -1; + + list_del_init(&wait->entry); + + init_task_work(&req->task_work, io_req_task_submit); + /* submit ref gets dropped, acquire a new one */ + refcount_inc(&req->refs); + ret = io_req_task_work_add(req, &req->task_work); + if (unlikely(ret)) { + struct task_struct *tsk; + + /* queue just for cancelation */ + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } + return 1; +} + +static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, + struct wait_page_queue *wait, + wait_queue_func_t func, + void *data) +{ + /* Can't support async wakeup with polled IO */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { + wait->wait.func = func; + wait->wait.private = data; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; + return 0; + } + + return -EOPNOTSUPP; +} + + +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + int ret; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* already tried, or we're doing O_DIRECT */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + return false; + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + /* + * If request type doesn't require req->io to defer in general, + * we need to allocate it here + */ + if (!req->io && __io_alloc_async_ctx(req)) + return false; + + ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, + io_async_buf_func, req); + if (!ret) { + io_get_req_task(req); + return true; + } + + return false; +} + +static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +{ + if (req->file->f_op->read_iter) + return call_read_iter(req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); } -static int io_read(struct io_kiocb *req, bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t io_size, ret; + ssize_t io_size, ret, ret2; + unsigned long nr_segs; ret = 
io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - req->result = 0; - io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; - - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - ssize_t ret2; + if (unlikely(ret)) + goto out_free; - if (req->file->f_op->read_iter) - ret2 = call_read_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + ret2 = io_iter_do_read(req, &iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); - } else { + /* Catch -EAGAIN return for forced non-blocking submission */ + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + goto out_free; + } } + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -2689,8 +3119,6 @@ out_free: static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2700,49 +3128,33 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->fsize = rlimit(RLIMIT_FSIZE); - /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, WRITE, force_nonblock); } -static int io_write(struct io_kiocb *req, bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t ret, io_size; + ssize_t ret, ret2, io_size; + unsigned long nr_segs; ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = 
ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - req->result = 0; - io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; - - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -2752,59 +3164,53 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - ssize_t ret2; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + if (unlikely(ret)) + goto out_free; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); - } else { + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. 
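The reworked io_read() above no longer punts every -EAGAIN to io-wq: after the iovec is copied into req->io with io_setup_async_rw(), io_rw_should_retry() arms kiocb->ki_waitq with io_async_buf_func() (only possible on FMODE_BUF_RASYNC files) and the read is re-issued; -EIOCBQUEUED then means the page waitqueue is armed and io_req_task_submit() will rerun the request from task_work once the page is ready. A condensed, illustrative restatement of that branch (not literal patch code):

	ret2 = io_iter_do_read(req, &iter);
	if (force_nonblock && ret2 == -EAGAIN) {
		/* preserve the user iovec for the retry */
		io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter);
		if (io_rw_should_retry(req)) {	/* sets IOCB_WAITQ, arms io_async_buf_func() */
			ret2 = io_iter_do_read(req, &iter);
			/* -EIOCBQUEUED: wait armed, request is resubmitted later via task_work */
		}
		/* still -EAGAIN: clear IOCB_WAITQ and fall back to the io-wq punt */
	}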
+ */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + return -EAGAIN; } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -2870,10 +3276,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -2907,25 +3312,23 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -2964,8 +3367,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock) req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -2980,7 +3382,6 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); - req->fsize = rlimit(RLIMIT_FSIZE); return 0; } @@ -2991,15 +3392,11 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) /* fallocate always requiring blocking context */ if (force_nonblock) return -EAGAIN; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3095,8 +3492,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3150,7 +3546,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3169,8 +3566,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - 
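/*
 * The call sites above and below are converted from the
 * io_cqring_add_event() + io_put_req() pair to io_req_complete() or
 * __io_req_complete(), the latter taking the io_comp_state threaded down
 * from the submit path so completions can be flushed in one batch.  The
 * helpers are defined earlier in the patch; a minimal sketch of how they
 * plausibly fit together, inferred only from the call sites shown here:
 */
static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
			      struct io_comp_state *cs)
{
	if (!cs) {
		/* no batch available: post the CQE and drop the ref now */
		io_cqring_add_event(req, res, cflags);
		io_put_req(req);
	} else {
		/* queue on the batch; io_submit_flush_completions() drains it */
		req->result = res;
		req->compl.cflags = cflags;
		list_add_tail(&req->compl.list, &cs->list);
		if (++cs->nr >= 32)
			io_submit_flush_completions(cs);
	}
}

static inline void io_req_complete(struct io_kiocb *req, long res)
{
	__io_req_complete(req, res, 0, NULL);
}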
io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3228,7 +3624,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3257,8 +3654,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3289,7 +3685,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -3301,8 +3698,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; #else return -EOPNOTSUPP; @@ -3338,8 +3734,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3378,8 +3773,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3418,8 +3812,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3450,7 +3843,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_close *close = &req->close; int ret; @@ -3464,8 +3858,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; } @@ -3473,10 +3869,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) ret = filp_close(close->put_file, req->work.files); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); fput(close->put_file); close->put_file = NULL; - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3510,8 +3905,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) req->sync.flags); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3531,6 +3925,15 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + 
iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; @@ -3541,7 +3944,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3555,136 +3958,126 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - struct io_sr_msg *sr = &req->sr_msg; - - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (unlikely(!sock)) + return ret; - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.iov); - if (ret) - return ret; - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec 
iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret;; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = &req->sr_msg; struct iovec __user *uiov; size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, - &uiov, &iov_len); + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = io->msg.iov[0].iov_len; - iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len = iomsg->iov[0].iov_len; + iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, sr->len); - io->msg.iov = NULL; + iomsg->iov = NULL; } else { ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &io->msg.iov, &io->msg.msg.msg_iter); + &iomsg->iov, &iomsg->msg.msg_iter); if (ret > 0) ret = 0; } @@ -3694,7 +4087,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_ctx *io) + struct io_async_msghdr *iomsg) { struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; @@ -3703,8 +4096,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->msg; - ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + msg_compat = (struct compat_msghdr __user *) sr->umsg; + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, &ptr, &len); if (ret) return ret; @@ -3721,12 +4114,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = io->msg.iov[0].iov_len; - io->msg.iov = NULL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; } else { ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &io->msg.iov, - &io->msg.msg.msg_iter); + &iomsg->iov, + &iomsg->msg.msg_iter); if (ret < 0) return ret; } @@ -3735,40 +4128,40 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } #endif -static int 
io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, io); + return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif - return __io_recvmsg_copy_hdr(req, io); + return __io_recvmsg_copy_hdr(req, iomsg); } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - int *cflags, bool needs_lock) + bool needs_lock) { struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return NULL; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; sr->kbuf = kbuf; req->flags |= REQ_F_BUFFER_SELECTED; - - *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - *cflags |= IORING_CQE_F_BUFFER; return kbuf; } +static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3780,7 +4173,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); @@ -3795,133 +4188,123 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - ret = io_recvmsg_copy_hdr(req, io); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; + struct io_buffer *kbuf; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_buffer *kbuf; - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (unlikely(!sock)) + return ret; - ret = io_recvmsg_copy_hdr(req, &io); - if (ret) - return ret; - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - } else if (kbuf) { - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, - 1, req->sr_msg.len); - } + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - 
req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (kbuf) - kfree(kbuf); - } + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags, cs); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; + struct iovec iov; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - void __user *buf = sr->buf; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - else if (kbuf) - buf = u64_to_user_ptr(kbuf->addr); + buf = u64_to_user_ptr(kbuf->addr); + } - ret = import_single_range(READ, buf, sr->len, &iov, - &msg.msg_iter); - if (ret) { - kfree(kbuf); - return ret; - } + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; - req->flags |= REQ_F_NEED_CLEANUP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; - kfree(kbuf); - req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; +out_free: + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags, cs); return 0; } @@ -3941,7 +4324,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct 
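/*
 * io_recv()/io_recvmsg() above now report the provided buffer they consumed
 * through io_put_recv_kbuf(), i.e. in the CQE flags.  A hedged userspace
 * sketch of the whole round trip (assumes a liburing build with
 * io_uring_prep_provide_buffers(); group id 7 and the sizes are arbitrary,
 * error handling trimmed):
 */
#include <liburing.h>

static int recv_with_provided_buffers(struct io_uring *ring, int sockfd,
				      void *pool, unsigned buf_len,
				      unsigned nr_bufs)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret, bid = -1;

	/* hand nr_bufs buffers of buf_len bytes to the kernel as group 7 */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_provide_buffers(sqe, pool, buf_len, nr_bufs, 7, 0);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe) == 0)
		io_uring_cqe_seen(ring, cqe);

	/* recv without a buffer of our own; the kernel picks one from group 7 */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_recv(sqe, sockfd, NULL, buf_len, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 7;
	io_uring_submit(ring);

	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;
	if (ret >= 0 && (cqe->flags & IORING_CQE_F_BUFFER))
		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; /* buffer that was used */
	io_uring_cqe_seen(ring, cqe);
	return ret < 0 ? ret : bid;
}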
io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -3960,8 +4344,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3985,7 +4368,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->connect.address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_ctx __io, *io; unsigned file_flags; @@ -4021,8 +4405,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } #else /* !CONFIG_NET */ @@ -4031,12 +4414,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4047,12 +4432,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4062,7 +4449,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4072,7 +4460,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4084,33 +4473,9 @@ struct io_poll_table { int error; }; -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. 
- */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) - notify = TWA_SIGNAL; - - ret = task_work_add(tsk, cb, notify); - if (!ret) - wake_up_process(tsk); - return ret; -} - static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - struct task_struct *tsk; int ret; /* for instances that support it check for an event match first: */ @@ -4121,7 +4486,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); - tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); /* @@ -4132,6 +4496,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, 0); @@ -4200,7 +4566,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -4212,13 +4578,8 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); - mutex_unlock(&ctx->uring_lock); - } + if (nxt) + __io_req_task_submit(nxt); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4288,7 +4649,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = 0; poll->head = head; - add_wait_queue(head, &poll->wait); + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, @@ -4300,34 +4665,11 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); @@ -4337,40 +4679,19 @@ static void io_async_task_func(struct callback_head *cb) } /* If req is still hashed, it cannot have been canceled. Don't check. 
*/ - if (hash_hashed(&req->hash_node)) { + if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - } else { - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - } - } io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - /* restore ->work in case we need to retry again */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); + kfree(apoll->double_poll); kfree(apoll); - - if (!canceled) { - __set_current_state(TASK_RUNNING); - if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT); - goto end_req; - } - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - io_cqring_ev_posted(ctx); -end_req: - req_set_fail_links(req); - io_double_put_req(req); - } } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -4403,8 +4724,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; - poll->file = req->file; io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; poll->wait.private = req; ipt->pt._key = mask; @@ -4444,7 +4765,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!req->file || !file_can_poll(req->file)) return false; - if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + if (req->flags & REQ_F_POLLED) return false; if (!def->pollin && !def->pollout) return false; @@ -4455,9 +4776,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&apoll->work, &req->work, sizeof(req->work)); - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4476,8 +4794,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); return false; @@ -4520,14 +4836,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { io_put_req(req); - /* - * restore ->work because we will call - * io_req_work_drop_env below when dropping the - * final reference. 
- */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, - sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); } @@ -4608,10 +4916,9 @@ static int io_poll_remove(struct io_kiocb *req) ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4635,7 +4942,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4644,8 +4951,12 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); io_get_req_task(req); return 0; @@ -4659,7 +4970,6 @@ static int io_poll_add(struct io_kiocb *req) __poll_t mask; INIT_HLIST_NODE(&req->hash_node); - INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, @@ -4686,15 +4996,16 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - if (!list_empty(&req->list)) - list_del_init(&req->list); + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -4711,9 +5022,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); + list_del_init(&req->timeout.list); ret = 0; break; } @@ -4795,7 +5106,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; @@ -4823,8 +5133,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - if (!off) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } @@ -4837,16 +5146,17 @@ static int io_timeout(struct io_kiocb *req) * the one we need first. 
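/*
 * io_poll_add_prep() above now reads a full 32-bit mask from
 * sqe->poll32_events (halfword-swapped on big-endian so the old 16-bit
 * poll_events layout keeps working), which is what lets EPOLLEXCLUSIVE
 * reach __io_queue_proc() and its add_wait_queue_exclusive() path.  A
 * hedged userspace sketch, assuming a liburing whose
 * io_uring_prep_poll_add() forwards the full 32-bit mask and an
 * io_uring_params p obtained at setup time:
 */
#include <sys/epoll.h>
#include <liburing.h>

static int arm_exclusive_poll(struct io_uring *ring,
			      struct io_uring_params *p, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	if (!(p->features & IORING_FEAT_POLL_32BITS))
		return -EOPNOTSUPP;	/* kernel only honours the low 16 bits */

	io_uring_prep_poll_add(sqe, fd, EPOLLIN | EPOLLEXCLUSIVE);
	return io_uring_submit(ring);
}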
*/ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nxt->timeout.target_seq - tail) break; } add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -4950,7 +5260,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -4968,8 +5279,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4981,15 +5291,11 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } - - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + if (io_alloc_async_ctx(req)) + return -EAGAIN; + ret = io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -5091,86 +5397,117 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, &req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. 
*/ - if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) + return 0; + + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) - return -EAGAIN; ret = io_req_defer_prep(req, sqe); - if (ret < 0) + if (ret) return ret; } + io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - return 0; + kfree(de); + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + de->seq = seq; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: kfree((void *)(unsigned long)req->rw.addr); - /* fallthrough */ - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_RECVMSG: - if (req->flags & REQ_F_BUFFER_SELECTED) - kfree(req->sr_msg.kbuf); - /* fallthrough */ - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_RECV: - if (req->flags & REQ_F_BUFFER_SELECTED) + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: kfree(req->sr_msg.kbuf); - break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - break; - case IORING_OP_SPLICE: - case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); - break; + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; } - - req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -5180,7 +5517,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case 
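/*
 * io_req_defer() above now derives the drain sequence explicitly via
 * io_get_sequence() and parks deferred requests in a separate
 * io_defer_entry.  The userspace-visible IOSQE_IO_DRAIN semantics are
 * unchanged; a hedged liburing sketch of a drained fsync (error handling
 * trimmed):
 */
#include <liburing.h>

static int queue_drained_fsync(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_fsync(sqe, fd, 0);
	sqe->flags |= IOSQE_IO_DRAIN;	/* start only after all prior SQEs complete */
	return io_uring_submit(ring);
}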
IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -5190,7 +5527,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { @@ -5232,9 +5569,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -5244,9 +5581,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -5270,7 +5607,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -5278,7 +5615,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -5310,7 +5647,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -5318,7 +5655,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -5358,7 +5695,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: if (sqe) { @@ -5374,7 +5711,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_provide_buffers(req, force_nonblock); + ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: if (sqe) { @@ -5382,7 +5719,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_remove_buffers(req, force_nonblock); + ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: if (sqe) { @@ -5417,25 +5754,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_arm_async_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link; - - /* link head's timeout is queued in io_queue_async_work() */ - if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) - return; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); -} - -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *timeout; int ret = 0; - io_arm_async_linked_timeout(req); + timeout = 
io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == @@ -5445,7 +5772,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5459,11 +5786,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } - io_steal_work(req, workptr); + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -5520,6 +5846,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + io_req_init_async(req); + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) @@ -5545,6 +5873,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5577,8 +5912,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } @@ -5611,8 +5945,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK_HEAD)) return NULL; - /* for polled retry, if flag is set, we already went through here */ - if (req->flags & REQ_F_POLLED) + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, @@ -5624,7 +5957,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -5644,54 +5978,45 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { - if (io_arm_poll_handler(req)) { - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - goto exit; - } + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + if (!io_arm_poll_handler(req)) { punt: - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) + ret = io_prep_work_files(req); + if (unlikely(ret)) goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); } - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. 
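/*
 * io_wq_submit_work() above now arms the linked timeout itself through
 * io_prep_linked_timeout()/io_queue_linked_timeout() before running the
 * request.  From userspace such a guard is an IORING_OP_LINK_TIMEOUT SQE
 * chained behind the request it protects; a hedged liburing sketch
 * (values illustrative, error handling trimmed):
 */
#include <liburing.h>

static int read_with_timeout(struct io_uring *ring, int fd,
			     void *buf, unsigned len)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* the next SQE is linked to this read */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);

	/*
	 * Two CQEs come back: if the read beats the timer it completes
	 * normally and the timeout CQE is -ECANCELED; if the timer fires
	 * first the read is cancelled (-ECANCELED) and the timeout reports
	 * -ETIME.
	 */
	return io_uring_submit(ring);
}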
- */ - io_queue_async_work(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); goto exit; } + if (unlikely(ret)) { err: - nxt = NULL; - /* drop submission reference */ - io_put_req_find_next(req, &nxt); - - if (linked_timeout) { - if (!ret) - io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); - } - - /* and drop final reference, if we failed */ - if (ret) { - io_cqring_add_event(req, ret); + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); + io_req_complete(req, ret); + goto exit; } + + /* drop submission reference */ + nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + if (nxt) { req = nxt; @@ -5704,7 +6029,8 @@ exit: revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -5712,17 +6038,14 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { - ret = -EAGAIN; - if (io_alloc_async_ctx(req)) - goto fail_req; ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) + if (unlikely(ret)) goto fail_req; } @@ -5734,21 +6057,22 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link) + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5774,21 +6098,19 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_link_head(head, cs); *link = NULL; } } else { @@ -5800,15 +6122,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } @@ -5820,6 +6139,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, */ static void io_submit_state_end(struct io_submit_state *state) { + 
if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) @@ -5830,9 +6151,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -5897,12 +6224,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int id; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); req->io = NULL; @@ -5950,7 +6271,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; @@ -5967,10 +6288,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5985,28 +6303,28 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_consume_sqe(ctx); break; } - req = io_alloc_req(ctx, statep); + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - err = io_init_req(ctx, req, sqe, statep); + err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; if (unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link); + err = io_submit_sqe(req, sqe, &link, &state.comp); if (err) goto fail_req; } @@ -6017,9 +6335,8 @@ fail_req: percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_queue_link_head(link, &state.comp); + io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -6027,6 +6344,21 @@ fail_req: return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -6043,12 +6375,12 @@ static int io_sq_thread(void *data) while 
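/*
 * The open-coded sq_flags updates in io_sq_thread() are replaced by the
 * io_ring_set_wakeup_flag()/io_ring_clear_wakeup_flag() helpers added
 * above.  The userspace side of the IORING_SETUP_SQPOLL contract is
 * unchanged: kick the poller with IORING_ENTER_SQ_WAKEUP only when
 * IORING_SQ_NEED_WAKEUP shows up in the mapped flags word.  A hedged
 * raw-syscall sketch ("sq_flags" is the word mapped via
 * params.sq_off.flags):
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static long sqpoll_submit(int ring_fd, const unsigned *sq_flags,
			  unsigned to_submit)
{
	unsigned enter_flags = 0;

	if (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
		enter_flags |= IORING_ENTER_SQ_WAKEUP;

	if (!enter_flags)
		return to_submit;	/* poller is awake and will see the SQEs */

	return syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
		       enter_flags, NULL, 0);
}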
(!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); @@ -6067,7 +6399,7 @@ static int io_sq_thread(void *data) * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -6076,11 +6408,10 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || need_resched() || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6090,21 +6421,18 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. */ if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->poll_list)) { + !list_empty_careful(&ctx->iopoll_list)) { finish_wait(&ctx->sqo_wait, &wait); continue; } - /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_set_wakeup_flag(ctx); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6112,9 +6440,9 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); + io_ring_clear_wakeup_flag(ctx); continue; } if (signal_pending(current)) @@ -6122,17 +6450,13 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); } mutex_lock(&ctx->uring_lock); @@ -6142,10 +6466,9 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); revert_creds(old_cred); kthread_parkme(); @@ -6208,9 +6531,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6232,8 +6554,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, 
&iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(¤t->sighand->siglock); @@ -7019,17 +7341,21 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + if (ctx->sqo_mm) { + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } return ret; } -static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) { atomic_long_sub(nr_pages, &user->locked_vm); } -static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -7047,6 +7373,41 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) +{ + if (ctx->limit_mem) + __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm -= nr_pages; + else if (acct == ACCT_PINNED) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + } +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) +{ + int ret; + + if (ctx->limit_mem) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm += nr_pages; + else if (acct == ACCT_PINNED) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + } + + return 0; +} + static void io_mem_free(void *ptr) { struct page *page; @@ -7083,6 +7444,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7090,9 +7454,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } @@ -7121,8 +7482,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7205,11 +7565,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - if (ctx->account_mem) { - ret = io_account_mem(ctx->user, nr_pages); - if (ret) - goto err; - } + ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); + if (ret) + goto err; ret = 0; if (!pages || nr_pages > got_pages) { @@ -7222,8 +7580,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7233,8 +7590,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - if (ctx->account_mem) - 
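/*
 * The "if (current->task_works) task_work_run();" sequences in this file
 * (io_sq_thread(), io_cqring_wait(), io_uring_enter()) are folded into
 * io_run_task_work().  The helper is added earlier in the patch; a minimal
 * sketch consistent with the converted call sites:
 */
static inline bool io_run_task_work(void)
{
	if (current->task_works) {
		__set_current_state(TASK_RUNNING);
		task_work_run();
		return true;
	}
	return false;
}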
io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } @@ -7265,8 +7621,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); kvfree(imu->bvec); goto err; } @@ -7350,11 +7705,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); - if (ctx->sqo_mm) + io_sqe_buffer_unregister(ctx); + if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } - io_iopoll_reap_events(ctx); - io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); @@ -7418,11 +7774,8 @@ static int io_remove_personalities(int id, void *p, void *data) static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx; - - ctx = container_of(work, struct io_ring_ctx, exit_work); - if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + exit_work); /* * If we're doing polled IO and end up having requests being @@ -7430,11 +7783,11 @@ static void io_ring_exit_work(struct work_struct *work) * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ - while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { - io_iopoll_reap_events(ctx); + do { if (ctx->rings) io_cqring_overflow_flush(ctx, true); - } + io_iopoll_try_reap_events(ctx); + } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -7450,10 +7803,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); - io_iopoll_reap_events(ctx); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); /* @@ -7461,9 +7814,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * is closed but resources aren't reaped yet. This can cause * spurious failure in setting up a new ring. */ - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), + ACCT_LOCKED); INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); @@ -7519,17 +7871,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (cancel_req->flags & REQ_F_OVERFLOW) { spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->list); + list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } - spin_unlock_irq(&ctx->completion_lock); + io_cqring_mark_overflow(ctx); WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. 
If that's @@ -7652,8 +8001,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; @@ -7692,8 +8040,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - unsigned nr_events = 0; - min_complete = min(min_complete, ctx->cq_entries); /* @@ -7704,7 +8050,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, &nr_events, min_complete); + ret = io_iopoll_check(ctx, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } @@ -7909,7 +8255,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; - bool account_mem; + bool limit_mem; int ret; if (!entries) @@ -7948,10 +8294,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, } user = get_uid(current_user()); - account_mem = !capable(CAP_IPC_LOCK); + limit_mem = !capable(CAP_IPC_LOCK); - if (account_mem) { - ret = io_account_mem(user, + if (limit_mem) { + ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { free_uid(user); @@ -7961,14 +8307,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (account_mem) - io_unaccount_mem(user, ring_pages(p->sq_entries, + if (limit_mem) + __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->account_mem = account_mem; ctx->user = user; ctx->creds = get_current_cred(); @@ -8000,12 +8345,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; goto err; } + + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -8289,7 +8644,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d634561f871a..78f5c96c76f3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -612,9 +612,6 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block) /* * Initialize the superblock and read the root inode. 
- * - * Note: a check_disk_change() has been done immediately prior - * to this call, so we don't need to check again. */ static int isofs_fill_super(struct super_block *s, void *data, int silent) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index eb8b9e233d73..2935d4c776ec 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -36,6 +36,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 66acea9d878b..bde787c354fc 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -6,6 +6,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" diff --git a/fs/locks.c b/fs/locks.c index 7df0f9fa66f4..938fe325bc54 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1282,6 +1282,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, if (!new_fl) goto out; locks_copy_lock(new_fl, request); + locks_move_blocks(new_fl, request); request = new_fl; new_fl = NULL; locks_insert_lock_ctx(request, &fl->fl_list); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 3c4811469ae8..a87d4391e6b5 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -8,6 +8,7 @@ #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include "dir.h" #include "aops.h" diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 37d38697eaf8..837971e74109 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/blkdev.h> static int devinfo_show(struct seq_file *f, void *v) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 7b4bac91146b..bb02989d92b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -78,6 +78,7 @@ #include <linux/namei.h> #include <linux/capability.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include "../internal.h" /* ugh */ #include <linux/uaccess.h> diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index ff336513c254..155b82870333 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -15,6 +15,7 @@ #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> +#include <linux/blkdev.h> /* * LOCKING: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 52de29000c7e..6e264dded46e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -339,7 +339,6 @@ out: return ret; } -/* Should pair with userfaultfd_signal_pending() */ static inline long userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) @@ -351,18 +350,6 @@ static inline long userfaultfd_get_blocking_state(unsigned int flags) return TASK_UNINTERRUPTIBLE; } -/* Should pair with userfaultfd_get_blocking_state() */ -static inline bool userfaultfd_signal_pending(unsigned int flags) -{ - if (flags & FAULT_FLAG_INTERRUPTIBLE) - return signal_pending(current); - - if (flags & FAULT_FLAG_KILLABLE) - return fatal_signal_pending(current); - - return false; -} - /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -516,33 +503,9 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) vmf->flags, reason); mmap_read_unlock(mm); - if (likely(must_wait && !READ_ONCE(ctx->released) && - !userfaultfd_signal_pending(vmf->flags))) { + if (likely(must_wait && 
!READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); - ret |= VM_FAULT_MAJOR; - - /* - * False wakeups can orginate even from rwsem before - * up_read() however userfaults will wait either for a - * targeted wakeup on the specific uwq waitqueue from - * wake_userfault() or for signals or for uffd - * release. - */ - while (!READ_ONCE(uwq.waken)) { - /* - * This needs the full smp_store_mb() - * guarantee as the state write must be - * visible to other CPUs before reading - * uwq.waken from other CPUs. - */ - set_current_state(blocking_state); - if (READ_ONCE(uwq.waken) || - READ_ONCE(ctx->released) || - userfaultfd_signal_pending(vmf->flags)) - break; - schedule(); - } } __set_current_state(TASK_RUNNING); diff --git a/fs/verity/open.c b/fs/verity/open.c index d007db0c9304..bfe0280c14e4 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -221,11 +221,20 @@ out: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple processes may race to set ->i_verity_info, so use cmpxchg. - * This pairs with the READ_ONCE() in fsverity_get_info(). + * Multiple tasks may race to set ->i_verity_info, so use + * cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., here we publish ->i_verity_info with a + * RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { + /* Lost the race, so free the fsverity_info we allocated. */ fsverity_free_info(vi); + /* + * Afterwards, the caller may access ->i_verity_info directly, + * so make sure to ACQUIRE the winning fsverity_info. + */ + (void)fsverity_get_info(inode); + } } void fsverity_free_info(struct fsverity_info *vi) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 00db81eac80d..fdbff4860d61 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1080,7 +1080,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; } diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c index 4bcc3e61056c..b03333f1c84a 100644 --- a/fs/xfs/xfs_pwork.c +++ b/fs/xfs/xfs_pwork.c @@ -132,5 +132,5 @@ xfs_pwork_guess_datadev_parallelism( * For now we'll go with the most conservative setting possible, * which is two threads for an SSD and 1 thread everywhere else. */ - return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1; + return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1; }
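
The fs/verity/open.c hunk above replaces a plain cmpxchg()/READ_ONCE() pairing with cmpxchg_release()/smp_load_acquire(): the writer publishes a fully initialised fsverity_info with release ordering, and readers pick it up with acquire ordering. A minimal userspace sketch of the same publish/consume pattern follows, written with C11 atomics rather than the kernel primitives; struct info, publish_info() and get_info() are invented names for illustration only, not part of the patch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct info {
	int value;                      /* stands in for the published object's fields */
};

static _Atomic(struct info *) g_info;   /* plays the role of inode->i_verity_info */

/* Returns 0 if we published @vi, -1 if another thread won the race. */
static int publish_info(struct info *vi)
{
	struct info *expected = NULL;

	/*
	 * Release ordering on success: all stores that initialised *vi are
	 * visible before the pointer itself becomes visible to readers.
	 */
	if (atomic_compare_exchange_strong_explicit(&g_info, &expected, vi,
						    memory_order_release,
						    memory_order_relaxed))
		return 0;

	/* Lost the race; another thread already published, drop our copy. */
	free(vi);
	return -1;
}

static struct info *get_info(void)
{
	/* Acquire load pairs with the release in publish_info(). */
	return atomic_load_explicit(&g_info, memory_order_acquire);
}

int main(void)
{
	struct info *vi = malloc(sizeof(*vi));

	vi->value = 42;
	publish_info(vi);
	printf("published value: %d\n", get_info()->value);
	return 0;
}

In a single thread this is trivially correct; the ordering only matters when publish_info() and get_info() run on different CPUs, which is the situation the fs/verity change is addressing.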