From 609805d809733d0c669f21f710bdac308cc63cba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 30 May 2017 05:29:09 +0100 Subject: Btrfs: fix invalid extent maps due to hole punching While punching a hole in a range that is not aligned with the sector size (currently the same as the page size) we can end up leaving an extent map in memory with a length that is smaller then the sector size or with a start offset that is not aligned to the sector size. Both cases are not expected and can lead to problems. This issue is easily detected after the patch from commit a7e3b975a0f9 ("Btrfs: fix reported number of inode blocks"), introduced in kernel 4.12-rc1, in a scenario like the following for example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ xfs_io -c "pwrite -S 0xaa -b 100K 0 100K" /mnt/foo $ xfs_io -c "fpunch 60K 90K" /mnt/foo $ xfs_io -c "pwrite -S 0xbb -b 100K 50K 100K" /mnt/foo $ xfs_io -c "pwrite -S 0xcc -b 50K 100K 50K" /mnt/foo $ umount /mnt After the unmount operation we can see several warnings emmitted due to underflows related to space reservation counters: [ 2837.443299] ------------[ cut here ]------------ [ 2837.447395] WARNING: CPU: 8 PID: 2474 at fs/btrfs/inode.c:9444 btrfs_destroy_inode+0xe8/0x27e [btrfs] [ 2837.452108] Modules linked in: dm_flakey dm_mod ppdev parport_pc psmouse parport sg pcspkr acpi_cpufreq tpm_tis tpm_tis_core i2c_piix4 i2c_core evdev tpm button se rio_raw sunrpc loop autofs4 ext4 crc16 jbd2 mbcache btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_gene ric raid1 raid0 multipath linear md_mod sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [ 2837.458389] CPU: 8 PID: 2474 Comm: umount Tainted: G W 4.10.0-rc8-btrfs-next-43+ #1 [ 2837.459754] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 2837.462379] Call Trace: [ 2837.462379] dump_stack+0x68/0x92 [ 2837.462379] __warn+0xc2/0xdd [ 2837.462379] warn_slowpath_null+0x1d/0x1f [ 2837.462379] btrfs_destroy_inode+0xe8/0x27e [btrfs] [ 2837.462379] destroy_inode+0x3d/0x55 [ 2837.462379] evict+0x177/0x17e [ 2837.462379] dispose_list+0x50/0x71 [ 2837.462379] evict_inodes+0x132/0x141 [ 2837.462379] generic_shutdown_super+0x3f/0xeb [ 2837.462379] kill_anon_super+0x12/0x1c [ 2837.462379] btrfs_kill_super+0x16/0x21 [btrfs] [ 2837.462379] deactivate_locked_super+0x30/0x68 [ 2837.462379] deactivate_super+0x36/0x39 [ 2837.462379] cleanup_mnt+0x58/0x76 [ 2837.462379] __cleanup_mnt+0x12/0x14 [ 2837.462379] task_work_run+0x77/0x9b [ 2837.462379] prepare_exit_to_usermode+0x9d/0xc5 [ 2837.462379] syscall_return_slowpath+0x196/0x1b9 [ 2837.462379] entry_SYSCALL_64_fastpath+0xab/0xad [ 2837.462379] RIP: 0033:0x7f3ef3e6b9a7 [ 2837.462379] RSP: 002b:00007ffdd0d8de58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2837.462379] RAX: 0000000000000000 RBX: 0000556f76a39060 RCX: 00007f3ef3e6b9a7 [ 2837.462379] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556f76a3f910 [ 2837.462379] RBP: 0000556f76a3f910 R08: 0000556f76a3e670 R09: 0000000000000015 [ 2837.462379] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f3ef436ce64 [ 2837.462379] R13: 0000000000000000 R14: 0000556f76a39240 R15: 00007ffdd0d8e0e0 [ 2837.519355] ---[ end trace e79345fe24b30b8d ]--- [ 2837.596256] ------------[ cut here ]------------ [ 2837.597625] WARNING: CPU: 8 PID: 2474 at fs/btrfs/extent-tree.c:5699 btrfs_free_block_groups+0x246/0x3eb [btrfs] [ 2837.603547] Modules linked in: dm_flakey dm_mod ppdev parport_pc psmouse parport sg pcspkr acpi_cpufreq tpm_tis tpm_tis_core i2c_piix4 i2c_core evdev tpm button serio_raw sunrpc loop autofs4 ext4 crc16 jbd2 mbcache btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [ 2837.659372] CPU: 8 PID: 2474 Comm: umount Tainted: G W 4.10.0-rc8-btrfs-next-43+ #1 [ 2837.663359] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 2837.663359] Call Trace: [ 2837.663359] dump_stack+0x68/0x92 [ 2837.663359] __warn+0xc2/0xdd [ 2837.663359] warn_slowpath_null+0x1d/0x1f [ 2837.663359] btrfs_free_block_groups+0x246/0x3eb [btrfs] [ 2837.663359] close_ctree+0x1dd/0x2e1 [btrfs] [ 2837.663359] ? evict_inodes+0x132/0x141 [ 2837.663359] btrfs_put_super+0x15/0x17 [btrfs] [ 2837.663359] generic_shutdown_super+0x6a/0xeb [ 2837.663359] kill_anon_super+0x12/0x1c [ 2837.663359] btrfs_kill_super+0x16/0x21 [btrfs] [ 2837.663359] deactivate_locked_super+0x30/0x68 [ 2837.663359] deactivate_super+0x36/0x39 [ 2837.663359] cleanup_mnt+0x58/0x76 [ 2837.663359] __cleanup_mnt+0x12/0x14 [ 2837.663359] task_work_run+0x77/0x9b [ 2837.663359] prepare_exit_to_usermode+0x9d/0xc5 [ 2837.663359] syscall_return_slowpath+0x196/0x1b9 [ 2837.663359] entry_SYSCALL_64_fastpath+0xab/0xad [ 2837.663359] RIP: 0033:0x7f3ef3e6b9a7 [ 2837.663359] RSP: 002b:00007ffdd0d8de58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2837.663359] RAX: 0000000000000000 RBX: 0000556f76a39060 RCX: 00007f3ef3e6b9a7 [ 2837.663359] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556f76a3f910 [ 2837.663359] RBP: 0000556f76a3f910 R08: 0000556f76a3e670 R09: 0000000000000015 [ 2837.663359] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f3ef436ce64 [ 2837.663359] R13: 0000000000000000 R14: 0000556f76a39240 R15: 00007ffdd0d8e0e0 [ 2837.739445] ---[ end trace e79345fe24b30b8e ]--- [ 2837.745595] ------------[ cut here ]------------ [ 2837.746412] WARNING: CPU: 8 PID: 2474 at fs/btrfs/extent-tree.c:5700 btrfs_free_block_groups+0x261/0x3eb [btrfs] [ 2837.747955] Modules linked in: dm_flakey dm_mod ppdev parport_pc psmouse parport sg pcspkr acpi_cpufreq tpm_tis tpm_tis_core i2c_piix4 i2c_core evdev tpm button serio_raw sunrpc loop autofs4 ext4 crc16 jbd2 mbcache btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [ 2837.755395] CPU: 8 PID: 2474 Comm: umount Tainted: G W 4.10.0-rc8-btrfs-next-43+ #1 [ 2837.756769] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 2837.758526] Call Trace: [ 2837.758925] dump_stack+0x68/0x92 [ 2837.759383] __warn+0xc2/0xdd [ 2837.759383] warn_slowpath_null+0x1d/0x1f [ 2837.759383] btrfs_free_block_groups+0x261/0x3eb [btrfs] [ 2837.759383] close_ctree+0x1dd/0x2e1 [btrfs] [ 2837.759383] ? evict_inodes+0x132/0x141 [ 2837.759383] btrfs_put_super+0x15/0x17 [btrfs] [ 2837.759383] generic_shutdown_super+0x6a/0xeb [ 2837.759383] kill_anon_super+0x12/0x1c [ 2837.759383] btrfs_kill_super+0x16/0x21 [btrfs] [ 2837.759383] deactivate_locked_super+0x30/0x68 [ 2837.759383] deactivate_super+0x36/0x39 [ 2837.759383] cleanup_mnt+0x58/0x76 [ 2837.759383] __cleanup_mnt+0x12/0x14 [ 2837.759383] task_work_run+0x77/0x9b [ 2837.759383] prepare_exit_to_usermode+0x9d/0xc5 [ 2837.759383] syscall_return_slowpath+0x196/0x1b9 [ 2837.759383] entry_SYSCALL_64_fastpath+0xab/0xad [ 2837.759383] RIP: 0033:0x7f3ef3e6b9a7 [ 2837.759383] RSP: 002b:00007ffdd0d8de58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2837.759383] RAX: 0000000000000000 RBX: 0000556f76a39060 RCX: 00007f3ef3e6b9a7 [ 2837.759383] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556f76a3f910 [ 2837.759383] RBP: 0000556f76a3f910 R08: 0000556f76a3e670 R09: 0000000000000015 [ 2837.759383] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f3ef436ce64 [ 2837.759383] R13: 0000000000000000 R14: 0000556f76a39240 R15: 00007ffdd0d8e0e0 [ 2837.777063] ---[ end trace e79345fe24b30b8f ]--- [ 2837.778235] ------------[ cut here ]------------ [ 2837.778856] WARNING: CPU: 8 PID: 2474 at fs/btrfs/extent-tree.c:9825 btrfs_free_block_groups+0x348/0x3eb [btrfs] [ 2837.791385] Modules linked in: dm_flakey dm_mod ppdev parport_pc psmouse parport sg pcspkr acpi_cpufreq tpm_tis tpm_tis_core i2c_piix4 i2c_core evdev tpm button serio_raw sunrpc loop autofs4 ext4 crc16 jbd2 mbcache btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [ 2837.797711] CPU: 8 PID: 2474 Comm: umount Tainted: G W 4.10.0-rc8-btrfs-next-43+ #1 [ 2837.798594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 2837.800118] Call Trace: [ 2837.800515] dump_stack+0x68/0x92 [ 2837.801015] __warn+0xc2/0xdd [ 2837.801471] warn_slowpath_null+0x1d/0x1f [ 2837.801698] btrfs_free_block_groups+0x348/0x3eb [btrfs] [ 2837.801698] close_ctree+0x1dd/0x2e1 [btrfs] [ 2837.801698] ? evict_inodes+0x132/0x141 [ 2837.801698] btrfs_put_super+0x15/0x17 [btrfs] [ 2837.801698] generic_shutdown_super+0x6a/0xeb [ 2837.801698] kill_anon_super+0x12/0x1c [ 2837.801698] btrfs_kill_super+0x16/0x21 [btrfs] [ 2837.801698] deactivate_locked_super+0x30/0x68 [ 2837.801698] deactivate_super+0x36/0x39 [ 2837.801698] cleanup_mnt+0x58/0x76 [ 2837.801698] __cleanup_mnt+0x12/0x14 [ 2837.801698] task_work_run+0x77/0x9b [ 2837.801698] prepare_exit_to_usermode+0x9d/0xc5 [ 2837.801698] syscall_return_slowpath+0x196/0x1b9 [ 2837.801698] entry_SYSCALL_64_fastpath+0xab/0xad [ 2837.801698] RIP: 0033:0x7f3ef3e6b9a7 [ 2837.801698] RSP: 002b:00007ffdd0d8de58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2837.801698] RAX: 0000000000000000 RBX: 0000556f76a39060 RCX: 00007f3ef3e6b9a7 [ 2837.801698] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556f76a3f910 [ 2837.801698] RBP: 0000556f76a3f910 R08: 0000556f76a3e670 R09: 0000000000000015 [ 2837.801698] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f3ef436ce64 [ 2837.801698] R13: 0000000000000000 R14: 0000556f76a39240 R15: 00007ffdd0d8e0e0 [ 2837.818441] ---[ end trace e79345fe24b30b90 ]--- [ 2837.818991] BTRFS info (device sdc): space_info 1 has 7974912 free, is not full [ 2837.819830] BTRFS info (device sdc): space_info total=8388608, used=417792, pinned=0, reserved=0, may_use=18446744073709547520, readonly=0 What happens in the above example is the following: 1) When punching the hole, at btrfs_punch_hole(), the variable tail_len is set to 2048 (as tail_start is 148Kb + 1 and offset + len is 150Kb). This results in the creation of an extent map with a length of 2Kb starting at file offset 148Kb, through find_first_non_hole() -> btrfs_get_extent(). 2) The second write (first write after the hole punch operation), sets the range [50Kb, 152Kb[ to delalloc. 3) The third write, at btrfs_find_new_delalloc_bytes(), sees the extent map covering the range [148Kb, 150Kb[ and ends up calling set_extent_bit() for the same range, which results in splitting an existing extent state record, covering the range [148Kb, 152Kb[ into two 2Kb extent state records, covering the ranges [148Kb, 150Kb[ and [150Kb, 152Kb[. 4) Finally at lock_and_cleanup_extent_if_need(), immediately after calling btrfs_find_new_delalloc_bytes() we clear the delalloc bit from the range [100Kb, 152Kb[ which results in the btrfs_clear_bit_hook() callback being invoked against the two 2Kb extent state records that cover the ranges [148Kb, 150Kb[ and [150Kb, 152Kb[. When called against the first 2Kb extent state, it calls btrfs_delalloc_release_metadata() with a length argument of 2048 bytes. That function rounds up the length to a sector size aligned length, so it ends up considering a length of 4096 bytes, and then calls calc_csum_metadata_size() which results in decrementing the inode's csum_bytes counter by 4096 bytes, so after it stays a value of 0 bytes. Then the same happens when btrfs_clear_bit_hook() is called against the second extent state that has a length of 2Kb, covering the range [150Kb, 152Kb[, the length is rounded up to 4096 and calc_csum_metadata_size() ends up being called to decrement 4096 bytes from the inode's csum_bytes counter, which at that time has a value of 0, leading to an underflow, which is exactly what triggers the first warning, at btrfs_destroy_inode(). All the other warnings relate to several space accounting counters that underflow as well due to similar reasons. A similar case but where the hole punching operation creates an extent map with a start offset not aligned to the sector size is the following: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ xfs_io -f -c "fpunch 695K 820K" $SCRATCH_MNT/bar $ xfs_io -c "pwrite -S 0xaa 1008K 307K" $SCRATCH_MNT/bar $ xfs_io -c "pwrite -S 0xbb -b 630K 1073K 630K" $SCRATCH_MNT/bar $ xfs_io -c "pwrite -S 0xcc -b 459K 1068K 459K" $SCRATCH_MNT/bar $ umount /mnt During the unmount operation we get similar traces for the same reasons as in the first example. So fix the hole punching operation to make sure it never creates extent maps with a length that is not aligned to the sector size nor with a start offset that is not aligned to the sector size, as this breaks all assumptions and it's a land mine. Fixes: d77815461f04 ("btrfs: Avoid trucating page or punching hole in a already existed hole.") Cc: Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da1096eb1a40..5da85b080368 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2390,10 +2390,13 @@ out: */ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize), 0); if (IS_ERR(em)) return PTR_ERR(em); -- cgit v1.2.3 From 364ecf3651e0862152c8b340d7cb3021dc0122c7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:38 +0800 Subject: btrfs: qgroup: Introduce extent changeset for qgroup reserve functions Introduce a new parameter, struct extent_changeset for btrfs_qgroup_reserved_data() and its callers. Such extent_changeset was used in btrfs_qgroup_reserve_data() to record which range it reserved in current reserve, so it can free it in error paths. The reason we need to export it to callers is, at buffered write error path, without knowing what exactly which range we reserved in current allocation, we can free space which is not reserved by us. This will lead to qgroup reserved space underflow. Reviewed-by: Chandan Rajendra Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++++-- fs/btrfs/extent-tree.c | 23 +++++++++++++---------- fs/btrfs/extent_io.h | 34 +++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 12 +++++++++--- fs/btrfs/inode-map.c | 4 +++- fs/btrfs/inode.c | 18 ++++++++++++++---- fs/btrfs/ioctl.c | 5 ++++- fs/btrfs/qgroup.c | 51 ++++++++++++++++++++++++++++++++------------------ fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 4 +++- 10 files changed, 119 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e33e1d6d5c9..1ee4489dc398 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2708,8 +2708,9 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); @@ -2727,7 +2728,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b0b02c6c71aa..70f85cfdbd46 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3402,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3521,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3552,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -4326,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4346,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -6175,6 +6175,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * @@ -6192,11 +6194,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ce670d213913..aeafdb35d90b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -215,6 +215,40 @@ struct extent_changeset { struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5da85b080368..1b5cce51728b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1802,6 +1805,7 @@ again: } } + extent_changeset_free(data_reserved); return num_written ? num_written : ret; } @@ -2772,6 +2776,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2901,8 +2906,8 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -2974,6 +2979,7 @@ out: if (ret != 0) btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b66ea03a3a1c..d9cf6df40d5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2037,6 +2037,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -2074,7 +2075,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -2094,6 +2095,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -4769,6 +4771,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4783,7 +4786,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4868,6 +4871,7 @@ out_unlock: unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -8718,6 +8722,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8754,7 +8759,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8811,6 +8817,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -9043,6 +9050,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -9068,7 +9076,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -9174,6 +9182,7 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); @@ -9181,6 +9190,7 @@ out: btrfs_delalloc_release_space(inode, page_start, reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e4116f9248c2..ccee5417d3f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1247,6 +1248,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { @@ -1256,6 +1258,7 @@ out: btrfs_delalloc_release_space(inode, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 475d53c492c8..002088937021 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2835,43 +2835,60 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); return ret; } @@ -2882,8 +2899,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -2899,7 +2915,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, changeset.bytes_changed); ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2999,8 +3015,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3017,5 +3032,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d11125d6afb9..99408e93eb0d 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -242,7 +242,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b291d1bebb4c..6407423151ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3129,6 +3130,7 @@ int prealloc_file_extent_cluster(struct inode *inode, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } -- cgit v1.2.3 From bc42bda22345efdb5d8b578d1b4df2c6eaa85c58 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:39 +0800 Subject: btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges [BUG] For the following case, btrfs can underflow qgroup reserved space at an error path: (Page size 4K, function name without "btrfs_" prefix) Task A | Task B ---------------------------------------------------------------------- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserved_data() | | Range alinged to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EQUOTA) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analyse) [CAUSE] Above Task B is freeing reserved data range [0, 4K) which is actually reserved by Task A. And at writeback time, page dirty by Task A will go through writeback routine, which will free 4K reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add @reserved parameter to only free data ranges reserved by previous btrfs_qgroup_reserve_data(). So in above case, Task B will try to free 0 byte, so no underflow. Reported-by: Chandan Rajendra Signed-off-by: Qu Wenruo Reviewed-by: Chandan Rajendra Tested-by: Chandan Rajendra Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +++-- fs/btrfs/extent-tree.c | 12 +++++---- fs/btrfs/file.c | 29 +++++++++++--------- fs/btrfs/inode.c | 29 ++++++++++---------- fs/btrfs/ioctl.c | 4 +-- fs/btrfs/qgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 8 +++--- 8 files changed, 117 insertions(+), 46 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1ee4489dc398..5bdd36664421 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2711,7 +2711,10 @@ enum btrfs_flush_state { int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2730,7 +2733,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 70f85cfdbd46..a3339acec28c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4389,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4399,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -6204,7 +6205,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6223,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1b5cce51728b..0f102a1b851f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1660,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1743,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1799,9 +1801,9 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } @@ -2918,8 +2920,8 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2938,8 +2940,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2977,8 +2980,8 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d9cf6df40d5e..5d3c6ac960fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -345,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -2935,7 +2935,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -4794,7 +4794,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4866,7 +4866,7 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); @@ -5266,7 +5266,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8792,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8808,8 +8808,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -9008,7 +9008,7 @@ again: * free the entire extent. */ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -9130,8 +9130,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9187,7 +9187,8 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); @@ -10557,7 +10558,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ccee5417d3f6..b4e9941efb60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1227,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1255,7 +1255,7 @@ out: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 002088937021..fc9dffaa9524 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2892,13 +2892,72 @@ cleanup: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -2924,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2951,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 99408e93eb0d..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -245,7 +245,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, int btrfs_qgroup_reserve_data(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6407423151ab..dc69b6ba29af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3114,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3126,8 +3126,8 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); extent_changeset_free(data_reserved); -- cgit v1.2.3