summaryrefslogtreecommitdiff
path: root/fs/btrfs/block-group.c
diff options
context:
space:
mode:
authorDennis Zhou <dennis@kernel.org>2019-12-13 16:22:14 -0800
committerDavid Sterba <dsterba@suse.com>2020-01-20 16:40:57 +0100
commitb0643e59cfa609c4b5f246f2b2c33b078f87e9d9 (patch)
tree04ff6c50b7ea0f5284b1237d198210a69d0e71f6 /fs/btrfs/block-group.c
parentda080fe1bad4777b02f6a3db42823a8797aadbca (diff)
downloadlwn-b0643e59cfa609c4b5f246f2b2c33b078f87e9d9.tar.gz
lwn-b0643e59cfa609c4b5f246f2b2c33b078f87e9d9.zip
btrfs: add the beginning of async discard, discard workqueue
When discard is enabled, everytime a pinned extent is released back to the block_group's free space cache, a discard is issued for the extent. This is an overeager approach when it comes to discarding and helping the SSD maintain enough free space to prevent severe garbage collection situations. This adds the beginning of async discard. Instead of issuing a discard prior to returning it to the free space, it is just marked as untrimmed. The block_group is then added to a LRU which then feeds into a workqueue to issue discards at a much slower rate. Full discarding of unused block groups is still done and will be addressed in a future patch of the series. For now, we don't persist the discard state of extents and bitmaps. Therefore, our failure recovery mode will be to consider extents untrimmed. This lets us handle failure and unmounting as one in the same. On a number of Facebook webservers, I collected data every minute accounting the time we spent in btrfs_finish_extent_commit() (col. 1) and in btrfs_commit_transaction() (col. 2). btrfs_finish_extent_commit() is where we discard extents synchronously before returning them to the free space cache. discard=sync: p99 total per minute p99 total per minute Drive | extent_commit() (ms) | commit_trans() (ms) --------------------------------------------------------------- Drive A | 434 | 1170 Drive B | 880 | 2330 Drive C | 2943 | 3920 Drive D | 4763 | 5701 discard=async: p99 total per minute p99 total per minute Drive | extent_commit() (ms) | commit_trans() (ms) -------------------------------------------------------------- Drive A | 134 | 956 Drive B | 64 | 1972 Drive C | 59 | 1032 Drive D | 62 | 1200 While it's not great that the stats are cumulative over 1m, all of these servers are running the same workload and and the delta between the two are substantial. We are spending significantly less time in btrfs_finish_extent_commit() which is responsible for discarding. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Dennis Zhou <dennis@kernel.org> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r--fs/btrfs/block-group.c37
1 files changed, 34 insertions, 3 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index be1938dc94fd..6ba15c45e779 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -14,6 +14,7 @@
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
+#include "discard.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -132,6 +133,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
WARN_ON(cache->reserved > 0);
/*
+ * A block_group shouldn't be on the discard_list anymore.
+ * Remove the block_group from the discard_list to prevent us
+ * from causing a panic due to NULL pointer dereference.
+ */
+ if (WARN_ON(!list_empty(&cache->discard_list)))
+ btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+ cache);
+
+ /*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
@@ -466,8 +476,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start,
- size);
+ ret = btrfs_add_free_space_async_trimmed(block_group,
+ start, size);
BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
@@ -478,7 +488,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start, size);
+ ret = btrfs_add_free_space_async_trimmed(block_group, start,
+ size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
@@ -1258,6 +1269,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
@@ -1333,6 +1346,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ /*
+ * At this point, the block_group is read only and should fail
+ * new allocations. However, btrfs_finish_extent_commit() can
+ * cause this block_group to be placed back on the discard
+ * lists because now the block_group isn't fully discarded.
+ * Bail here and try again later after discarding everything.
+ */
+ spin_lock(&fs_info->discard_ctl.lock);
+ if (!list_empty(&block_group->discard_list)) {
+ spin_unlock(&fs_info->discard_ctl.lock);
+ btrfs_dec_block_group_ro(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto end_trans;
+ }
+ spin_unlock(&fs_info->discard_ctl.lock);
+
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
@@ -1603,6 +1633,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);