author     Josef Bacik <josef@toxicpanda.com>   2019-06-20 15:37:55 -0400
committer  David Sterba <dsterba@suse.com>      2019-09-09 14:59:08 +0200
commit     e3e0520b32bc3dbc64110536d171bfb334ac7a2a (patch)
tree       86ee7f3afc767568e685b6556384dee0d7c93938
parent     3b2a78f21d5c53ff34b8e03cba4f904c91d4b3a2 (diff)
download   lwn-e3e0520b32bc3dbc64110536d171bfb334ac7a2a.tar.gz
           lwn-e3e0520b32bc3dbc64110536d171bfb334ac7a2a.zip
btrfs: migrate the block group removal code
This is the removal code and the unused bgs code.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ refresh, move clear_incompat_bg_bits ]
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--  fs/btrfs/block-group.c  540
-rw-r--r--  fs/btrfs/block-group.h    7
-rw-r--r--  fs/btrfs/ctree.h          7
-rw-r--r--  fs/btrfs/extent-tree.c  537
4 files changed, 547 insertions, 544 deletions
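
After this change the block group removal helpers are declared in block-group.h instead of ctree.h. A minimal caller-side sketch of the migrated interface (illustrative only; example_cleaner_pass is a hypothetical name and not part of this patch -- in the tree the cleaner thread plays this role):

#include "ctree.h"
#include "block-group.h"

/* Hypothetical illustration: drive the unused-block-group reclaim path
 * that this patch moves into block-group.c. */
static void example_cleaner_pass(struct btrfs_fs_info *fs_info)
{
	/* Scan fs_info->unused_bgs and remove block groups with no
	 * allocated space; each removal runs in its own transaction
	 * started via btrfs_start_trans_remove_block_group(). */
	btrfs_delete_unused_bgs(fs_info);
}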
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 643a2f16603b..a27f814b86bd 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -6,6 +6,10 @@
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "transaction.h"
+#include "ref-verify.h"
void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
@@ -660,3 +664,539 @@ int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
return ret;
}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * Clear incompat bits for the following feature(s):
+ *
+ * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
+ * in the whole filesystem
+ */
+static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ struct list_head *head = &fs_info->space_info;
+ struct btrfs_space_info *sinfo;
+
+ list_for_each_entry_rcu(sinfo, head, list) {
+ bool found = false;
+
+ down_read(&sinfo->groups_sem);
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
+ found = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
+ found = true;
+ up_read(&sinfo->groups_sem);
+
+ if (found)
+ return;
+ }
+ btrfs_clear_fs_incompat(fs_info, RAID56);
+ }
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_path *path;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_free_cluster *cluster;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct kobject *kobj = NULL;
+ int ret;
+ int index;
+ int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
+ bool remove_rsv = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ trace_btrfs_remove_block_group(block_group);
+ /*
+ * Free the reserved super bytes from this block group before
+ * remove it.
+ */
+ btrfs_free_excluded_extents(block_group);
+ btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
+ block_group->key.offset);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+
+ /* make sure this block group isn't part of an allocation cluster */
+ cluster = &fs_info->data_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ /*
+ * make sure this block group isn't part of a metadata
+ * allocation cluster
+ */
+ cluster = &fs_info->meta_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
+ inode = lookup_free_space_inode(block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * Make sure our free space cache IO is done before removing the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(trans, block_group, path);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ remove_rsv = true;
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ if (!IS_ERR(inode)) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for our lookup ref */
+ btrfs_add_delayed_iput(inode);
+ }
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ btrfs_release_path(path);
+ if (ret == 0) {
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ spin_lock(&fs_info->block_group_cache_lock);
+ rb_erase(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+
+ if (fs_info->first_logical_byte == block_group->key.objectid)
+ fs_info->first_logical_byte = (u64)-1;
+ spin_unlock(&fs_info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ /*
+ * we must use list_del_init so people can check to see if they
+ * are still on the list after taking the semaphore
+ */
+ list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
+ clear_avail_alloc_bits(fs_info, block_group->flags);
+ }
+ up_write(&block_group->space_info->groups_sem);
+ clear_incompat_bg_bits(fs_info, block_group->flags);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+
+ if (block_group->has_caching_ctl)
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ btrfs_wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ btrfs_remove_free_space_cache(block_group);
+
+ spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->key.offset * factor);
+ }
+ block_group->space_info->total_bytes -= block_group->key.offset;
+ block_group->space_info->bytes_readonly -= block_group->key.offset;
+ block_group->space_info->disk_total -= block_group->key.offset * factor;
+
+ spin_unlock(&block_group->space_info->lock);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+
+ mutex_lock(&fs_info->chunk_mutex);
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ spin_unlock(&block_group->lock);
+
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -EIO;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+out:
+ if (remove_rsv)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info, const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = em->map_lookup;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+ int trimming;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved || block_group->pinned ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = __btrfs_inc_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
+ if (IS_ERR(trans)) {
+ btrfs_dec_block_group_ro(block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info,
+ -block_group->pinned);
+ space_info->bytes_readonly += block_group->pinned;
+ percpu_counter_add_batch(&space_info->total_bytes_pinned,
+ -block_group->pinned,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ block_group->pinned = 0;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(fs_info, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
+end_trans:
+ btrfs_end_transaction(trans);
+next:
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_unused_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 143baaa54684..f1fe14ba2702 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -176,6 +176,13 @@ struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group_cache *cache);
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
u64 start, u64 end);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
static inline int btrfs_block_group_cache_done(
struct btrfs_block_group_cache *cache)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 17eb4c91f0e1..aedee3f66764 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2532,12 +2532,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type, u64 chunk_offset,
u64 size);
-struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
- struct btrfs_fs_info *fs_info,
- const u64 chunk_offset);
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
@@ -2618,7 +2612,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 08bd67169590..775d78a101b0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7501,530 +7501,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return 0;
}
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 extra_flags = chunk_to_extended(flags) &
- BTRFS_EXTENDED_PROFILE_MASK;
-
- write_seqlock(&fs_info->profiles_lock);
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits &= ~extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits &= ~extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits &= ~extra_flags;
- write_sequnlock(&fs_info->profiles_lock);
-}
-
-/*
- * Clear incompat bits for the following feature(s):
- *
- * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
- * in the whole filesystem
- */
-static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- struct list_head *head = &fs_info->space_info;
- struct btrfs_space_info *sinfo;
-
- list_for_each_entry_rcu(sinfo, head, list) {
- bool found = false;
-
- down_read(&sinfo->groups_sem);
- if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
- found = true;
- if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
- found = true;
- up_read(&sinfo->groups_sem);
-
- if (found)
- return;
- }
- btrfs_clear_fs_incompat(fs_info, RAID56);
- }
-}
-
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_path *path;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
- struct inode *inode;
- struct kobject *kobj = NULL;
- int ret;
- int index;
- int factor;
- struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
- bool remove_rsv = false;
-
- block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!block_group);
- BUG_ON(!block_group->ro);
-
- trace_btrfs_remove_block_group(block_group);
- /*
- * Free the reserved super bytes from this block group before
- * remove it.
- */
- btrfs_free_excluded_extents(block_group);
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
- block_group->key.offset);
-
- memcpy(&key, &block_group->key, sizeof(key));
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
- factor = btrfs_bg_type_to_factor(block_group->flags);
-
- /* make sure this block group isn't part of an allocation cluster */
- cluster = &fs_info->data_alloc_cluster;
- spin_lock(&cluster->refill_lock);
- btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&cluster->refill_lock);
-
- /*
- * make sure this block group isn't part of a metadata
- * allocation cluster
- */
- cluster = &fs_info->meta_alloc_cluster;
- spin_lock(&cluster->refill_lock);
- btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&cluster->refill_lock);
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * get the inode first so any iput calls done for the io_list
- * aren't the final iput (no unlinks allowed now)
- */
- inode = lookup_free_space_inode(block_group, path);
-
- mutex_lock(&trans->transaction->cache_write_mutex);
- /*
- * Make sure our free space cache IO is done before removing the
- * free space inode
- */
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->io_list)) {
- list_del_init(&block_group->io_list);
-
- WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
-
- spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_wait_cache_io(trans, block_group, path);
- btrfs_put_block_group(block_group);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- }
-
- if (!list_empty(&block_group->dirty_list)) {
- list_del_init(&block_group->dirty_list);
- remove_rsv = true;
- btrfs_put_block_group(block_group);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
- key.type = 0;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out;
- btrfs_release_path(path);
- }
-
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
- RB_CLEAR_NODE(&block_group->cache_node);
-
- if (fs_info->first_logical_byte == block_group->key.objectid)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
-
- down_write(&block_group->space_info->groups_sem);
- /*
- * we must use list_del_init so people can check to see if they
- * are still on the list after taking the semaphore
- */
- list_del_init(&block_group->list);
- if (list_empty(&block_group->space_info->block_groups[index])) {
- kobj = block_group->space_info->block_group_kobjs[index];
- block_group->space_info->block_group_kobjs[index] = NULL;
- clear_avail_alloc_bits(fs_info, block_group->flags);
- }
- up_write(&block_group->space_info->groups_sem);
- clear_incompat_bg_bits(fs_info, block_group->flags);
- if (kobj) {
- kobject_del(kobj);
- kobject_put(kobj);
- }
-
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
- if (block_group->cached == BTRFS_CACHE_STARTED)
- btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
- }
- }
-
- spin_lock(&trans->transaction->dirty_bgs_lock);
- WARN_ON(!list_empty(&block_group->dirty_list));
- WARN_ON(!list_empty(&block_group->io_list));
- spin_unlock(&trans->transaction->dirty_bgs_lock);
-
- btrfs_remove_free_space_cache(block_group);
-
- spin_lock(&block_group->space_info->lock);
- list_del_init(&block_group->ro_list);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- WARN_ON(block_group->space_info->total_bytes
- < block_group->key.offset);
- WARN_ON(block_group->space_info->bytes_readonly
- < block_group->key.offset);
- WARN_ON(block_group->space_info->disk_total
- < block_group->key.offset * factor);
- }
- block_group->space_info->total_bytes -= block_group->key.offset;
- block_group->space_info->bytes_readonly -= block_group->key.offset;
- block_group->space_info->disk_total -= block_group->key.offset * factor;
-
- spin_unlock(&block_group->space_info->lock);
-
- memcpy(&key, &block_group->key, sizeof(key));
-
- mutex_lock(&fs_info->chunk_mutex);
- spin_lock(&block_group->lock);
- block_group->removed = 1;
- /*
- * At this point trimming can't start on this block group, because we
- * removed the block group from the tree fs_info->block_group_cache_tree
- * so no one can't find it anymore and even if someone already got this
- * block group before we removed it from the rbtree, they have already
- * incremented block_group->trimming - if they didn't, they won't find
- * any free space entries because we already removed them all when we
- * called btrfs_remove_free_space_cache().
- *
- * And we must not remove the extent map from the fs_info->mapping_tree
- * to prevent the same logical address range and physical device space
- * ranges from being reused for a new block group. This is because our
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
- * completely transactionless, so while it is trimming a range the
- * currently running transaction might finish and a new one start,
- * allowing for new block groups to be created that can reuse the same
- * physical device locations unless we take this special care.
- *
- * There may also be an implicit trim operation if the file system
- * is mounted with -odiscard. The same protections must remain
- * in place until the extents have been discarded completely when
- * the transaction commit has completed.
- */
- remove_em = (atomic_read(&block_group->trimming) == 0);
- spin_unlock(&block_group->lock);
-
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
-
- btrfs_put_block_group(block_group);
- btrfs_put_block_group(block_group);
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
-
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
-out:
- if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
- btrfs_free_path(path);
- return ret;
-}
-
-struct btrfs_trans_handle *
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
- const u64 chunk_offset)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- unsigned int num_items;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
-
- /*
- * We need to reserve 3 + N units from the metadata space info in order
- * to remove a block group (done at btrfs_remove_chunk() and at
- * btrfs_remove_block_group()), which are used for:
- *
- * 1 unit for adding the free space inode's orphan (located in the tree
- * of tree roots).
- * 1 unit for deleting the block group item (located in the extent
- * tree).
- * 1 unit for deleting the free space item (located in tree of tree
- * roots).
- * N units for deleting N device extent items corresponding to each
- * stripe (located in the device tree).
- *
- * In order to remove a block group we also need to reserve units in the
- * system space info in order to update the chunk tree (update one or
- * more device items and remove one chunk item), but this is done at
- * btrfs_remove_chunk() through a call to check_system_chunk().
- */
- map = em->map_lookup;
- num_items = 3 + map->num_stripes;
- free_extent_map(em);
-
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items, 1);
-}
-
-/*
- * Process the unused_bgs list and remove any that don't have any allocated
- * space inside of them.
- */
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_space_info *space_info;
- struct btrfs_trans_handle *trans;
- int ret = 0;
-
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
- return;
-
- spin_lock(&fs_info->unused_bgs_lock);
- while (!list_empty(&fs_info->unused_bgs)) {
- u64 start, end;
- int trimming;
-
- block_group = list_first_entry(&fs_info->unused_bgs,
- struct btrfs_block_group_cache,
- bg_list);
- list_del_init(&block_group->bg_list);
-
- space_info = block_group->space_info;
-
- if (ret || btrfs_mixed_space_info(space_info)) {
- btrfs_put_block_group(block_group);
- continue;
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
- /* Don't want to race with allocators so take the groups_sem */
- down_write(&space_info->groups_sem);
- spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- btrfs_block_group_used(&block_group->item) ||
- block_group->ro ||
- list_is_singular(&block_group->list)) {
- /*
- * We want to bail if we made new allocations or have
- * outstanding allocations in this block group. We do
- * the ro check in case balance is currently acting on
- * this block group.
- */
- trace_btrfs_skip_unused_block_group(block_group);
- spin_unlock(&block_group->lock);
- up_write(&space_info->groups_sem);
- goto next;
- }
- spin_unlock(&block_group->lock);
-
- /* We don't want to force the issue, only flip if it's ok. */
- ret = __btrfs_inc_block_group_ro(block_group, 0);
- up_write(&space_info->groups_sem);
- if (ret < 0) {
- ret = 0;
- goto next;
- }
-
- /*
- * Want to do this before we do anything else so we can recover
- * properly if we fail to join the transaction.
- */
- trans = btrfs_start_trans_remove_block_group(fs_info,
- block_group->key.objectid);
- if (IS_ERR(trans)) {
- btrfs_dec_block_group_ro(block_group);
- ret = PTR_ERR(trans);
- goto next;
- }
-
- /*
- * We could have pending pinned extents for this block group,
- * just delete them, we don't care about them anymore.
- */
- start = block_group->key.objectid;
- end = start + block_group->key.offset - 1;
- /*
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
- * btrfs_finish_extent_commit(). If we are at transaction N,
- * another task might be running finish_extent_commit() for the
- * previous transaction N - 1, and have seen a range belonging
- * to the block group in freed_extents[] before we were able to
- * clear the whole block group range from freed_extents[]. This
- * means that task can lookup for the block group after we
- * unpinned it from freed_extents[] and removed it, leading to
- * a BUG_ON() at btrfs_unpin_extent_range().
- */
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-
- /* Reset pinned so btrfs_put_block_group doesn't complain */
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
-
- btrfs_space_info_update_bytes_pinned(fs_info, space_info,
- -block_group->pinned);
- space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- block_group->pinned = 0;
-
- spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
-
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD);
-
- /* Implicit trim during transaction commit. */
- if (trimming)
- btrfs_get_block_group_trimming(block_group);
-
- /*
- * Btrfs_remove_chunk will abort the transaction if things go
- * horribly wrong.
- */
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
-
- if (ret) {
- if (trimming)
- btrfs_put_block_group_trimming(block_group);
- goto end_trans;
- }
-
- /*
- * If we're not mounted with -odiscard, we can just forget
- * about this block group. Otherwise we'll need to wait
- * until transaction commit to do the actual discard.
- */
- if (trimming) {
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * A concurrent scrub might have added us to the list
- * fs_info->unused_bgs, so use a list_move operation
- * to add the block group to the deleted_bgs list.
- */
- list_move(&block_group->bg_list,
- &trans->transaction->deleted_bgs);
- spin_unlock(&fs_info->unused_bgs_lock);
- btrfs_get_block_group(block_group);
- }
-end_trans:
- btrfs_end_transaction(trans);
-next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- btrfs_put_block_group(block_group);
- spin_lock(&fs_info->unused_bgs_lock);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-}
-
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
@@ -8272,16 +7748,3 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
!atomic_read(&root->will_be_snapshotted));
}
}
-
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
-{
- struct btrfs_fs_info *fs_info = bg->fs_info;
-
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- trace_btrfs_add_unused_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-}
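
For reference, the metadata reservation explained in the comment inside btrfs_start_trans_remove_block_group() reduces to a small sum; the stripe count below is an assumed example value, not something taken from this patch:

/* Sketch of the reservation arithmetic (assuming a 2-stripe chunk, e.g. RAID1):
 *   1 unit  - free space inode orphan (tree of tree roots)
 *   1 unit  - block group item deletion (extent tree)
 *   1 unit  - free space item deletion (tree of tree roots)
 *   N units - one per device extent / stripe (device tree)
 */
unsigned int num_stripes = 2;               /* assumed example value */
unsigned int num_items = 3 + num_stripes;   /* 5 metadata units reserved */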