From ee1293308e01d359688243d665138f35a6f1f9b8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 14 Sep 2023 09:06:56 -0700 Subject: btrfs: add raid stripe tree definitions Add definitions for the raid stripe tree. This tree will hold information about the on-disk layout of the stripes in a RAID set. Each stripe extent has a 1:1 relationship with an on-disk extent item and is doing the logical to per-drive physical address translation for the extent item in question. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc3c32186d7e..ca65d7b7a6ca 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -73,6 +73,9 @@ /* Holds the block group items for extent tree v2. */ #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL +/* Tracks RAID stripes in block groups. */ +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL @@ -261,6 +264,8 @@ #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_RAID_STRIPE_KEY 230 + /* * Records the overall state of the qgroups. * There's only one instance of this key present, @@ -719,6 +724,30 @@ struct btrfs_free_space_header { __le64 num_bitmaps; } __attribute__ ((__packed__)); +struct btrfs_raid_stride { + /* The id of device this raid extent lives on. */ + __le64 devid; + /* The physical location on disk. */ + __le64 physical; +} __attribute__ ((__packed__)); + +/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types. */ +#define BTRFS_STRIPE_RAID0 1 +#define BTRFS_STRIPE_RAID1 2 +#define BTRFS_STRIPE_DUP 3 +#define BTRFS_STRIPE_RAID10 4 +#define BTRFS_STRIPE_RAID5 5 +#define BTRFS_STRIPE_RAID6 6 +#define BTRFS_STRIPE_RAID1C3 7 +#define BTRFS_STRIPE_RAID1C4 8 + +struct btrfs_stripe_extent { + __u8 encoding; + __u8 reserved[7]; + /* An array of raid strides this stripe is composed of. */ + struct btrfs_raid_stride strides[]; +} __attribute__ ((__packed__)); + #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) -- cgit v1.2.3 From 515020900d447796bc2f0f57064663617a11b65d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 14 Sep 2023 09:06:57 -0700 Subject: btrfs: read raid stripe tree from disk If we find the raid-stripe-tree on mount, read it from disk. This is a backward incompatible feature. The rescue=ignorebadroots mount option will skip this tree. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 6 ++++++ fs/btrfs/disk-io.c | 18 ++++++++++++++++++ fs/btrfs/fs.h | 1 + include/uapi/linux/btrfs.h | 1 + 4 files changed, 26 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 6a8f9629bbbd..ceb5f586a2d5 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -356,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) min_items++; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional @@ -407,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + case BTRFS_RAID_STRIPE_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 163f37ad1b27..dc577b3c53f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1179,6 +1179,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_RAID_STRIPE_TREE_OBJECTID: + return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } @@ -1259,6 +1261,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); + btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1804,6 +1807,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); + free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2280,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->stripe_root = root; + } + } + return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 5559c33cd53a..b1914a9eb9f6 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -373,6 +373,7 @@ struct btrfs_fs_info { struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; + struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dbb8b96da50d..b9a1d9af8ae8 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -333,6 +333,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) #define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) +#define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) struct btrfs_ioctl_feature_flags { __u64 compat_flags; -- cgit v1.2.3 From 182940f4f4dbd932776414744c8de64333957725 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 16 May 2023 16:35:45 -0700 Subject: btrfs: qgroup: add new quota mode for simple quotas Add a new quota mode called "simple quotas". It can be enabled by the existing quota enable ioctl via a new command, and sets an incompat bit, as the implementation of simple quotas will make backwards incompatible changes to the disk format of the extent tree. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 6 +-- fs/btrfs/fs.h | 3 +- fs/btrfs/ioctl.c | 3 +- fs/btrfs/qgroup.c | 110 +++++++++++++++++++++++++++++----------- fs/btrfs/qgroup.h | 17 +++++-- fs/btrfs/root-tree.c | 2 +- fs/btrfs/transaction.c | 7 ++- include/uapi/linux/btrfs.h | 2 + include/uapi/linux/btrfs_tree.h | 10 +++- 9 files changed, 115 insertions(+), 45 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 25d0cdf85a91..0a80224c8784 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -959,8 +959,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -1063,8 +1062,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 19f9a444bcd8..e8fcf34f208a 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -220,7 +220,8 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_DEBUG /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5dbc3de66193..58189d99a22c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3697,7 +3697,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(fs_info); + case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + ret = btrfs_quota_enable(fs_info, sa); break; case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(fs_info); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 69787b87a4f8..1c0efc1757c1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,9 +34,21 @@ enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) { if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return BTRFS_QGROUP_MODE_DISABLED; + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) + return BTRFS_QGROUP_MODE_SIMPLE; return BTRFS_QGROUP_MODE_FULL; } +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; +} + +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; +} + /* * Helpers to access qgroup reservation * @@ -350,6 +362,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); @@ -370,8 +384,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) int ret = 0; u64 flags = 0; u64 rescan_progress = 0; + bool simple; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); @@ -421,14 +436,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) "old qgroup version, quota disabled"); goto out; } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); + simple = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE); if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation) { + fs_info->generation && !simple) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } - fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, - ptr); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -571,7 +586,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) struct rb_node *node; bool ret = false; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup @@ -970,7 +985,8 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -983,6 +999,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; + const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1085,8 +1102,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); - fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; + if (simple) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + else + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); @@ -1214,8 +1234,14 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (simple) + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); + /* Skip rescan for simple qgroups. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + goto out_free_path; + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1330,6 +1356,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); @@ -1810,6 +1837,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 0; + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); @@ -1863,6 +1893,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; + if (!btrfs_qgroup_full_accounting(trans->fs_info)) + return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed @@ -1931,8 +1963,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_delayed_ref_root *delayed_refs; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) - || bytenr == 0 || num_bytes == 0) + if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) @@ -1970,7 +2001,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { @@ -2346,7 +2377,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, int level; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ @@ -2413,7 +2444,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); BUG_ON(root_eb == NULL); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); @@ -2747,7 +2778,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; @@ -2816,6 +2847,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) u64 qgroup_to_skip; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return 0; + delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = rb_first(&delayed_refs->dirty_extent_root))) { @@ -2931,7 +2965,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -2990,7 +3024,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; @@ -3076,7 +3110,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); } - if (srcid) { + if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; @@ -3339,6 +3373,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int slot; int ret; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); @@ -3419,10 +3456,15 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { - return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + if (btrfs_fs_closing(fs_info)) + return true; + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + return true; + if (!btrfs_qgroup_enabled(fs_info)) + return true; + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + return true; + return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3436,6 +3478,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) bool stopped = false; bool did_leaf_rescans = false; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; + path = btrfs_alloc_path(); if (!path) goto out; @@ -3539,6 +3584,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); + return -EINVAL; + } + if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & @@ -3569,7 +3619,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; - } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } @@ -3828,7 +3878,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, u64 to_reserve; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || len == 0) return 0; @@ -3960,7 +4010,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) + if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) return 0; /* In release case, we shouldn't have @reserved */ @@ -4071,7 +4121,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; @@ -4116,7 +4166,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4132,7 +4182,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4191,7 +4241,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ @@ -4299,7 +4349,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > @@ -4409,7 +4459,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, int ret = 0; int i; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(root->root_key.objectid) || !root->reloc_root) return 0; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index aed611774047..0441d2ff5a49 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -101,8 +101,15 @@ * subtree rescan for them. */ -#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) -#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) +/* + * These flags share the flags field of the btrfs_qgroup_status_item with the + * persisted flags defined in btrfs_tree.h. + * + * To minimize the chance of collision with new persisted status flags, these + * count backwards from the MSB. + */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) /* * Record a dirty extent, and info qgroup to update quota on it @@ -276,13 +283,17 @@ enum { ENUM_BIT(QGROUP_FREE), }; -int btrfs_quota_enable(struct btrfs_fs_info *fs_info); enum btrfs_qgroup_mode { BTRFS_QGROUP_MODE_DISABLED, BTRFS_QGROUP_MODE_FULL, + BTRFS_QGROUP_MODE_SIMPLE }; enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7c9d87b9b72e..603ad1459368 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -510,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + if (btrfs_qgroup_enabled(fs_info)) { /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3f9f933039c6..f5c3ea01c3d6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1620,11 +1620,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, int ret; /* - * Save some performance in the case that qgroups are not - * enabled. If this check races with the ioctl, rescan will - * kick in anyway. + * Save some performance in the case that full qgroups are not enabled. + * If this check races with the ioctl, rescan will kick in anyway. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b9a1d9af8ae8..7c29d82db9ee 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -334,6 +334,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) +#define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) struct btrfs_ioctl_feature_flags { __u64 compat_flags; @@ -754,6 +755,7 @@ struct btrfs_ioctl_get_dev_stats { #define BTRFS_QUOTA_CTL_ENABLE 1 #define BTRFS_QUOTA_CTL_DISABLE 2 #define BTRFS_QUOTA_CTL_RESCAN__NOTUSED 3 +#define BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA 4 struct btrfs_ioctl_quota_ctl_args { __u64 cmd; __u64 status; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index ca65d7b7a6ca..a1d5347f6adb 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1233,9 +1233,17 @@ static inline __u16 btrfs_qgroup_level(__u64 qgroupid) */ #define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) +/* + * Whether or not this filesystem is using simple quotas. Not exactly the + * incompat bit, because we support using simple quotas, disabling it, then + * going back to full qgroup quotas. + */ +#define BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE (1ULL << 3) + #define BTRFS_QGROUP_STATUS_FLAGS_MASK (BTRFS_QGROUP_STATUS_FLAG_ON | \ BTRFS_QGROUP_STATUS_FLAG_RESCAN | \ - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | \ + BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) #define BTRFS_QGROUP_STATUS_VERSION 1 -- cgit v1.2.3 From d9a620f77e33f2b0e9a5f131f3ee3c66d3285c57 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 30 Jan 2023 14:45:55 -0800 Subject: btrfs: new inline ref storing owning subvol of data extents In order to implement simple quota groups, we need to be able to associate a data extent with the subvolume that created it. Once you account for reflink, this information cannot be recovered without explicitly storing it. Options for storing it are: - a new key/item - a new extent inline ref item The former is backwards compatible, but wastes space, the latter is incompat, but is efficient in space and reuses the existing inline ref machinery, while only abusing it a tiny amount -- specifically, the new item is not a ref, per-se. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 5 ++++ fs/btrfs/backref.c | 3 +++ fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++-------- fs/btrfs/print-tree.c | 12 +++++++++ fs/btrfs/ref-verify.c | 3 +++ fs/btrfs/tree-checker.c | 3 +++ include/uapi/linux/btrfs_tree.h | 12 +++++++++ 7 files changed, 84 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ce18e261c861..fb1c1a1983f2 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -358,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); +BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, + root_id, 64); + BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, @@ -374,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type) if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); return 0; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b7d54efb4728..0cde873bdee2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, count, sc, GFP_NOFS); break; } + case BTRFS_EXTENT_OWNER_REF_KEY: + ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA)); + break; default: WARN_ON(1); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2dda3ecb49e6..8bd556c8bca4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -345,9 +345,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -356,26 +362,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -386,7 +391,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, WARN_ON(1); btrfs_print_leaf(eb); - btrfs_err(eb->fs_info, + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); @@ -887,6 +892,11 @@ again: while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } if (type == BTRFS_REF_TYPE_INVALID) { ret = -EUCLEAN; goto out; @@ -1742,6 +1752,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->type == BTRFS_SHARED_DATA_REF_KEY) ret = run_delayed_data_ref(trans, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) @@ -2313,6 +2325,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; @@ -2339,10 +2352,22 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); + + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + goto out; + + /* Check for an owner ref; skip over it to the real inline refs. */ + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + } /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + if (item_size != expected_size) goto out; /* @@ -2354,8 +2379,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item))) goto out; - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) @@ -4617,18 +4640,23 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4650,7 +4678,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, root_objectid); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 75042a7fcf30..7e46aa8a0444 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -83,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } +static void print_extent_owner_ref(const struct extent_buffer *eb, + const struct btrfs_extent_owner_ref *ref) +{ + ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA)); + pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref)); +} + static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + struct btrfs_extent_owner_ref *oref; struct btrfs_disk_key key; unsigned long end; unsigned long ptr; @@ -164,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + print_extent_owner_ref(eb, oref); + break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", eb->start, type); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e9e1ebd8dd6a..1f62976bee82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1547090bf2c0..1f2c389b0bfa 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1467,6 +1467,9 @@ static int check_extent_item(struct extent_buffer *leaf, } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index a1d5347f6adb..8b4def6aee52 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -233,6 +233,14 @@ #define BTRFS_SHARED_DATA_REF_KEY 184 +/* + * Special inline ref key which stores the id of the subvolume which originally + * created the extent. This subvolume owns the extent permanently from the + * perspective of simple quotas. Needed to know which subvolume to free quota + * usage from when the extent is deleted. + */ +#define BTRFS_EXTENT_OWNER_REF_KEY 188 + /* * block groups give us hints into the extent allocation trees. Which * blocks are free etc etc @@ -816,6 +824,10 @@ struct btrfs_shared_data_ref { __le32 count; } __attribute__ ((__packed__)); +struct btrfs_extent_owner_ref { + __le64 root_id; +} __attribute__ ((__packed__)); + struct btrfs_extent_inline_ref { __u8 type; __le64 offset; -- cgit v1.2.3 From bd7c1ea3a302aba727a1ced9937ec84c6407724e Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 28 Mar 2023 15:45:20 -0700 Subject: btrfs: qgroup: check generation when recording simple quota delta Simple quotas count extents only from the moment the feature is enabled. Therefore, if we do something like: 1. create subvol S 2. write F in S 3. enable quotas 4. remove F 5. write G in S then after 3. and 4. we would expect the simple quota usage of S to be 0 (putting aside some metadata extents that might be written) and after 5., it should be the size of G plus metadata. Therefore, we need to be able to determine whether a particular quota delta we are processing predates simple quota enablement. To do this, store the transaction id when quotas were enabled. In fs_info for immediate use and in the quota status item to make it recoverable on mount. When we see a delta, check if the generation of the extent item is less than that of quota enablement. If so, we should ignore the delta from this extent. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 2 ++ fs/btrfs/extent-tree.c | 4 ++++ fs/btrfs/fs.h | 1 + fs/btrfs/qgroup.c | 28 ++++++++++++++++++++++------ fs/btrfs/qgroup.h | 2 ++ include/uapi/linux/btrfs_tree.h | 9 +++++++++ 6 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index fb1c1a1983f2..aa0844535644 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -980,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, + enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 865c74dffe82..8372d065aef4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1566,6 +1566,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, .rsv_bytes = href->reserved_bytes, .is_data = true, .is_inc = true, + .generation = trans->transid, }; if (extent_op) @@ -1738,6 +1739,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, .rsv_bytes = 0, .is_data = false, .is_inc = true, + .generation = trans->transid, }; BUG_ON(!extent_op || !extent_op->update_flags); @@ -3287,6 +3289,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, .rsv_bytes = 0, .is_data = is_data, .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), }; /* In this branch refs == 1 */ @@ -4924,6 +4927,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, + .generation = trans->transid, .rsv_bytes = 0, .is_data = true, .is_inc = true, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e8fcf34f208a..2bd9bedc7095 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -687,6 +687,7 @@ struct btrfs_fs_info { /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + u64 qgroup_enable_gen; /* * If this is not 0, then it indicates a serious filesystem error has diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5b3ad2f15039..f11f0fc8a9ff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -369,6 +369,15 @@ static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } +static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot, + struct btrfs_qgroup_status_item *ptr) +{ + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); + fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -384,7 +393,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) int ret = 0; u64 flags = 0; u64 rescan_progress = 0; - bool simple; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return 0; @@ -437,9 +445,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - simple = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE); - if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation && !simple) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + qgroup_read_enable_gen(fs_info, l, slot, ptr); + } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); @@ -1103,10 +1111,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; - if (simple) + if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; - else + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); @@ -1210,6 +1220,8 @@ out_add_root: goto out_free_path; } + fs_info->qgroup_enable_gen = trans->transid; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid @@ -4655,6 +4667,10 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, if (!is_fstree(root)) return 0; + /* If the extent predates enabling quotas, don't count it. */ + if (delta->generation < fs_info->qgroup_enable_gen) + return 0; + spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, root); if (!qgroup) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 9858244eef20..855a4f978761 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -276,6 +276,8 @@ struct btrfs_squota_delta { u64 num_bytes; /* The number of bytes reserved for this extent. */ u64 rsv_bytes; + /* The generation the extent was created in. */ + u64 generation; /* Whether we are using or freeing the extent. */ bool is_inc; /* Whether the extent is data or metadata. */ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 8b4def6aee52..c25fc9614594 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1277,6 +1277,15 @@ struct btrfs_qgroup_status_item { * of the scan. It contains a logical address */ __le64 rescan; + + /* + * The generation when quotas were last enabled. Used by simple quotas to + * avoid decrementing when freeing an extent that was written before + * enable. + * + * Set only if flags contain BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE. + */ + __le64 enable_gen; } __attribute__ ((__packed__)); struct btrfs_qgroup_info_item { -- cgit v1.2.3