summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_zone_gc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_zone_gc.c')
-rw-r--r--fs/xfs/xfs_zone_gc.c1235
1 files changed, 1235 insertions, 0 deletions
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644
index 000000000000..c8a1d5c0332c
--- /dev/null
+++ b/fs/xfs/xfs_zone_gc.c
@@ -0,0 +1,1235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs_platform.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Implement Garbage Collection (GC) of partially used zoned.
+ *
+ * To support the purely sequential writes in each zone, zoned XFS needs to be
+ * able to move data remaining in a zone out of it to reset the zone to prepare
+ * for writing to it again.
+ *
+ * This is done by the GC thread implemented in this file. To support that a
+ * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
+ * write the garbage collected data into.
+ *
+ * Whenever the available space is below the chosen threshold, the GC thread
+ * looks for potential non-empty but not fully used zones that are worth
+ * reclaiming. Once found the rmap for the victim zone is queried, and after
+ * a bit of sorting to reduce fragmentation, the still live extents are read
+ * into memory and written to the GC target zone, and the bmap btree of the
+ * files is updated to point to the new location. To avoid taking the IOLOCK
+ * and MMAPLOCK for the entire GC process and thus affecting the latency of
+ * user reads and writes to the files, the GC writes are speculative and the
+ * I/O completion checks that no other writes happened for the affected regions
+ * before remapping.
+ *
+ * Once a zone does not contain any valid data, be that through GC or user
+ * block removal, it is queued for for a zone reset. The reset operation
+ * carefully ensures that the RT device cache is flushed and all transactions
+ * referencing the rmap have been committed to disk.
+ */
+
+/*
+ * Size of each GC scratch allocation, and the number of buffers.
+ */
+#define XFS_GC_BUF_SIZE SZ_1M
+#define XFS_GC_NR_BUFS 2
+static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
+
+/*
+ * Chunk that is read and written for each GC operation.
+ *
+ * Note that for writes to actual zoned devices, the chunk can be split when
+ * reaching the hardware limit.
+ */
+struct xfs_gc_bio {
+ struct xfs_zone_gc_data *data;
+
+ /*
+ * Entry into the reading/writing/resetting list. Only accessed from
+ * the GC thread, so no locking needed.
+ */
+ struct list_head entry;
+
+ /*
+ * State of this gc_bio. Done means the current I/O completed.
+ * Set from the bio end I/O handler, read from the GC thread.
+ */
+ enum {
+ XFS_GC_BIO_NEW,
+ XFS_GC_BIO_DONE,
+ } state;
+
+ /*
+ * Pointer to the inode and byte range in the inode that this
+ * GC chunk is operating on.
+ */
+ struct xfs_inode *ip;
+ loff_t offset;
+ unsigned int len;
+
+ /*
+ * Existing startblock (in the zone to be freed) and newly assigned
+ * daddr in the zone GCed into.
+ */
+ xfs_fsblock_t old_startblock;
+ xfs_daddr_t new_daddr;
+
+ /* Are we writing to a sequential write required zone? */
+ bool is_seq;
+
+ /* Open Zone being written to */
+ struct xfs_open_zone *oz;
+
+ struct xfs_rtgroup *victim_rtg;
+
+ /* Bio used for reads and writes, including the bvec used by it */
+ struct bio bio; /* must be last */
+};
+
+#define XFS_ZONE_GC_RECS 1024
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+ struct xfs_rtgroup *victim_rtg;
+ unsigned int rec_count;
+ unsigned int rec_idx;
+ xfs_agblock_t next_startblock;
+ struct xfs_rmap_irec *recs;
+};
+
+/*
+ * Per-mount GC state.
+ */
+struct xfs_zone_gc_data {
+ struct xfs_mount *mp;
+ struct xfs_open_zone *oz;
+
+ /* bioset used to allocate the gc_bios */
+ struct bio_set bio_set;
+
+ /*
+ * Scratchpad to buffer GC data, organized as a ring buffer over
+ * discontiguous folios. scratch_head is where the buffer is filled,
+ * scratch_tail tracks the buffer space freed, and scratch_available
+ * counts the space available in the ring buffer between the head and
+ * the tail.
+ */
+ struct folio *scratch_folios[XFS_GC_NR_BUFS];
+ unsigned int scratch_size;
+ unsigned int scratch_available;
+ unsigned int scratch_head;
+ unsigned int scratch_tail;
+
+ /*
+ * List of bios currently being read, written and reset.
+ * These lists are only accessed by the GC thread itself, and must only
+ * be processed in order.
+ */
+ struct list_head reading;
+ struct list_head writing;
+ struct list_head resetting;
+
+ /*
+ * Iterator for the victim zone.
+ */
+ struct xfs_zone_gc_iter iter;
+};
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes. Additionally, the m_zonegc_low_space tunable
+ * can be set to make sure a fraction of the unused blocks are available for
+ * writing.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
+{
+ s64 available, free, threshold;
+ s32 remainder;
+
+ /* If we have no reclaimable blocks, running GC is useless. */
+ if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
+ return false;
+
+ /*
+ * In order to avoid file fragmentation as much as possible, we should
+ * make sure that we can open enough zones. So trigger GC if the number
+ * of blocks immediately available for writes is lower than the total
+ * number of blocks from all possible open zones.
+ */
+ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
+ if (available <
+ xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+
+ /*
+ * For cases where the user wants to be more aggressive with GC,
+ * the sysfs attribute zonegc_low_space may be set to a non zero value,
+ * to indicate that GC should try to maintain at least zonegc_low_space
+ * percent of the free space to be directly available for writing. Check
+ * this here.
+ */
+ if (!mp->m_zonegc_low_space)
+ return false;
+
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+ threshold = div_s64_rem(free, 100, &remainder);
+ threshold = threshold * mp->m_zonegc_low_space +
+ remainder * div_s64(mp->m_zonegc_low_space, 100);
+
+ return available < threshold;
+}
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_gc_data *data;
+ int i;
+
+ data = kzalloc_obj(*data);
+ if (!data)
+ return NULL;
+ data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
+ if (!data->iter.recs)
+ goto out_free_data;
+
+ if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_recs;
+ for (i = 0; i < XFS_GC_NR_BUFS; i++) {
+ data->scratch_folios[i] =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
+ if (!data->scratch_folios[i])
+ goto out_free_scratch;
+ }
+ data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
+ data->scratch_available = data->scratch_size;
+ INIT_LIST_HEAD(&data->reading);
+ INIT_LIST_HEAD(&data->writing);
+ INIT_LIST_HEAD(&data->resetting);
+ data->mp = mp;
+ return data;
+
+out_free_scratch:
+ while (--i >= 0)
+ folio_put(data->scratch_folios[i]);
+ bioset_exit(&data->bio_set);
+out_free_recs:
+ kfree(data->iter.recs);
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+ struct xfs_zone_gc_data *data)
+{
+ int i;
+
+ for (i = 0; i < XFS_GC_NR_BUFS; i++)
+ folio_put(data->scratch_folios[i]);
+ bioset_exit(&data->bio_set);
+ kfree(data->iter.recs);
+ kfree(data);
+}
+
+static void
+xfs_zone_gc_iter_init(
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rtgroup *victim_rtg)
+
+{
+ iter->next_startblock = 0;
+ iter->rec_count = 0;
+ iter->rec_idx = 0;
+ iter->victim_rtg = victim_rtg;
+ atomic_inc(&victim_rtg->rtg_gccount);
+}
+
+/*
+ * Query the rmap of the victim zone to gather the records to evacuate.
+ */
+static int
+xfs_zone_gc_query_cb(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec,
+ void *private)
+{
+ struct xfs_zone_gc_iter *iter = private;
+
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+ ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+ ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+ iter->recs[iter->rec_count] = *irec;
+ if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+ iter->next_startblock =
+ irec->rm_startblock + irec->rm_blockcount;
+ return 1;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_rmap_irec *reca = a;
+ const struct xfs_rmap_irec *recb = b;
+ int diff;
+
+ diff = cmp_int(reca->rm_owner, recb->rm_owner);
+ if (diff)
+ return diff;
+ return cmp_int(reca->rm_offset, recb->rm_offset);
+}
+
+static int
+xfs_zone_gc_query(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *rtg = iter->victim_rtg;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_btree_cur *cur;
+ struct xfs_trans *tp;
+ int error;
+
+ ASSERT(iter->next_startblock <= rtg_blocks(rtg));
+ if (iter->next_startblock == rtg_blocks(rtg))
+ goto done;
+
+ ASSERT(iter->next_startblock < rtg_blocks(rtg));
+ ri_low.rm_startblock = iter->next_startblock;
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+
+ iter->rec_idx = 0;
+ iter->rec_count = 0;
+
+ tp = xfs_trans_alloc_empty(mp);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_zone_gc_query_cb, iter);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+ xfs_trans_cancel(tp);
+
+ if (error < 0)
+ return error;
+
+ /*
+ * Sort the rmap records by inode number and increasing offset to
+ * defragment the mappings.
+ *
+ * This could be further enhanced by an even bigger look ahead window,
+ * but that's better left until we have better detection of changes to
+ * inode mapping to avoid the potential of GCing already dead data.
+ */
+ sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+ xfs_zone_gc_rmap_rec_cmp, NULL);
+
+ if (error == 0) {
+ /*
+ * We finished iterating through the zone.
+ */
+ iter->next_startblock = rtg_blocks(rtg);
+ if (iter->rec_count == 0)
+ goto done;
+ }
+
+ return 0;
+done:
+ atomic_dec(&iter->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(iter->victim_rtg);
+ iter->victim_rtg = NULL;
+ return 0;
+}
+
+static bool
+xfs_zone_gc_iter_irec(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rmap_irec *chunk_rec,
+ struct xfs_inode **ipp)
+{
+ struct xfs_rmap_irec *irec;
+ int error;
+
+retry:
+ if (iter->rec_idx == iter->rec_count) {
+ error = xfs_zone_gc_query(mp, iter);
+ if (error)
+ goto fail;
+ if (!iter->victim_rtg)
+ return false;
+ }
+
+ irec = &iter->recs[iter->rec_idx];
+ error = xfs_iget(mp, NULL, irec->rm_owner,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+ if (error) {
+ /*
+ * If the inode was already deleted, skip over it.
+ */
+ if (error == -ENOENT) {
+ iter->rec_idx++;
+ goto retry;
+ }
+ goto fail;
+ }
+
+ if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
+ iter->rec_idx++;
+ xfs_irele(*ipp);
+ goto retry;
+ }
+
+ *chunk_rec = *irec;
+ return true;
+
+fail:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+ struct xfs_zone_gc_iter *iter,
+ xfs_extlen_t count_fsb)
+{
+ struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
+
+ irec->rm_offset += count_fsb;
+ irec->rm_startblock += count_fsb;
+ irec->rm_blockcount -= count_fsb;
+ if (!irec->rm_blockcount)
+ iter->rec_idx++;
+}
+
+static struct xfs_rtgroup *
+xfs_zone_gc_pick_victim_from(
+ struct xfs_mount *mp,
+ uint32_t bucket)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t victim_used = U32_MAX;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ uint32_t bit;
+
+ if (!zi->zi_used_bucket_entries[bucket])
+ return NULL;
+
+ for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
+ mp->m_sb.sb_rgcount) {
+ struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
+
+ if (!rtg)
+ continue;
+
+ /*
+ * If the zone is already undergoing GC, don't pick it again.
+ *
+ * This prevents us from picking one of the zones for which we
+ * already submitted GC I/O, but for which the remapping hasn't
+ * concluded yet. This won't cause data corruption, but
+ * increases write amplification and slows down GC, so this is
+ * a bad thing.
+ */
+ if (atomic_read(&rtg->rtg_gccount)) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
+ /* skip zones that are just waiting for a reset */
+ if (rtg_rmap(rtg)->i_used_blocks == 0 ||
+ rtg_rmap(rtg)->i_used_blocks >= victim_used) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
+ if (victim_rtg)
+ xfs_rtgroup_rele(victim_rtg);
+ victim_rtg = rtg;
+ victim_used = rtg_rmap(rtg)->i_used_blocks;
+
+ /*
+ * Any zone that is less than 1 percent used is fair game for
+ * instant reclaim. All of these zones are in the last
+ * bucket, so avoid the expensive division for the zones
+ * in the other buckets.
+ */
+ if (bucket == 0 &&
+ rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
+ break;
+ }
+
+ return victim_rtg;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate to
+ * reclaim.
+ */
+static bool
+xfs_zone_gc_select_victim(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ unsigned int bucket;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
+ victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
+ if (victim_rtg)
+ break;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ if (!victim_rtg)
+ return false;
+
+ trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
+ xfs_zone_gc_iter_init(iter, victim_rtg);
+ return true;
+}
+
+static int
+xfs_zone_gc_steal_open_zone(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_open_zone *oz, *found = NULL;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+ if (!found || oz->oz_allocated < found->oz_allocated)
+ found = oz;
+ }
+ if (!found) {
+ spin_unlock(&zi->zi_open_zones_lock);
+ return -EIO;
+ }
+
+ trace_xfs_zone_gc_target_stolen(found->oz_rtg);
+ found->oz_is_gc = true;
+ zi->zi_nr_open_zones--;
+ zi->zi_nr_open_gc_zones++;
+ spin_unlock(&zi->zi_open_zones_lock);
+
+ atomic_inc(&found->oz_ref);
+ data->oz = found;
+ return 0;
+}
+
+/*
+ * Ensure we have a valid open zone to write to.
+ */
+static bool
+xfs_zone_gc_select_target(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+
+ if (data->oz) {
+ /*
+ * If we have space available, just keep using the existing
+ * zone.
+ */
+ if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg))
+ return true;
+
+ /*
+ * Wait for all writes to the current zone to finish before
+ * picking a new one.
+ */
+ if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg))
+ return false;
+
+ xfs_open_zone_put(data->oz);
+ }
+
+ /*
+ * Open a new zone when there is none currently in use.
+ */
+ ASSERT(zi->zi_nr_open_zones <=
+ data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
+ data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true);
+ if (!data->oz)
+ return false;
+ trace_xfs_zone_gc_target_opened(data->oz->oz_rtg);
+ atomic_inc(&data->oz->oz_ref);
+ spin_lock(&zi->zi_open_zones_lock);
+ zi->zi_nr_open_gc_zones++;
+ list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones);
+ spin_unlock(&zi->zi_open_zones_lock);
+ return true;
+}
+
+static void
+xfs_zone_gc_end_io(
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ struct xfs_zone_gc_data *data = chunk->data;
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
+ wake_up_process(data->mp->m_zone_info->zi_gc_thread);
+}
+
+static bool
+xfs_zone_gc_alloc_blocks(
+ struct xfs_zone_gc_data *data,
+ xfs_extlen_t *count_fsb,
+ xfs_daddr_t *daddr,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = data->mp;
+ struct xfs_open_zone *oz = data->oz;
+
+ *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ *
+ * If we'd take them from the normal pool we could be stealing blocks
+ * from a regular writer, which would then have to wait for GC and
+ * deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ *count_fsb = min(*count_fsb,
+ rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
+ *count_fsb = min3(*count_fsb,
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail,
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!*count_fsb)
+ return false;
+
+ *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
+ if (!*is_seq)
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
+ oz->oz_allocated += *count_fsb;
+ atomic_inc(&oz->oz_ref);
+ return true;
+}
+
+static void
+xfs_zone_gc_add_data(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ unsigned int len = chunk->len;
+ unsigned int off = data->scratch_head;
+
+ do {
+ unsigned int this_off = off % XFS_GC_BUF_SIZE;
+ unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
+
+ bio_add_folio_nofail(&chunk->bio,
+ data->scratch_folios[off / XFS_GC_BUF_SIZE],
+ this_len, this_off);
+ len -= this_len;
+ off += this_len;
+ if (off == data->scratch_size)
+ off = 0;
+ } while (len);
+}
+
+static bool
+xfs_zone_gc_can_start_chunk(
+ struct xfs_zone_gc_data *data)
+{
+
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!data->scratch_available)
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return xfs_zone_gc_select_target(data);
+}
+
+static bool
+xfs_zone_gc_start_chunk(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_rmap_irec irec;
+ struct xfs_gc_bio *chunk;
+ struct xfs_inode *ip;
+ struct bio *bio;
+ xfs_daddr_t daddr;
+ bool is_seq;
+
+ if (!xfs_zone_gc_can_start_chunk(data))
+ return false;
+
+ set_current_state(TASK_RUNNING);
+ if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip))
+ return false;
+
+ if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
+ &is_seq)) {
+ xfs_irele(ip);
+ return false;
+ }
+
+ /*
+ * Scratch allocation can wrap around to the same buffer again,
+ * provision an extra bvec for that case.
+ */
+ bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS,
+ &data->bio_set);
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->ip = ip;
+ chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+ chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->old_startblock =
+ xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+ chunk->new_daddr = daddr;
+ chunk->is_seq = is_seq;
+ chunk->data = data;
+ chunk->oz = data->oz;
+ chunk->victim_rtg = iter->victim_rtg;
+ atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
+ bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+ bio->bi_end_io = xfs_zone_gc_end_io;
+ xfs_zone_gc_add_data(chunk);
+ data->scratch_head =
+ (data->scratch_head + chunk->len) % data->scratch_size;
+ data->scratch_available -= chunk->len;
+
+ XFS_STATS_INC(mp, xs_gc_read_calls);
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->reading);
+ xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+ submit_bio(bio);
+ return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ atomic_dec(&chunk->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(chunk->victim_rtg);
+ list_del(&chunk->entry);
+ xfs_open_zone_put(chunk->oz);
+ xfs_irele(chunk->ip);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_zone_gc_submit_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ if (chunk->is_seq) {
+ chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+ chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+ chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+ submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_zone_gc_split_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ struct queue_limits *lim =
+ &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+ struct xfs_gc_bio *split_chunk;
+ int split_sectors;
+ unsigned int split_len;
+ struct bio *split;
+ unsigned int nsegs;
+
+ if (!chunk->is_seq)
+ return NULL;
+
+ split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+ if (!split_sectors)
+ return NULL;
+
+ /* ensure the split chunk is still block size aligned */
+ split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
+ data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
+ split_len = split_sectors << SECTOR_SHIFT;
+
+ split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+ split_chunk = container_of(split, struct xfs_gc_bio, bio);
+ split_chunk->data = data;
+ ihold(VFS_I(chunk->ip));
+ split_chunk->ip = chunk->ip;
+ split_chunk->is_seq = chunk->is_seq;
+ split_chunk->offset = chunk->offset;
+ split_chunk->len = split_len;
+ split_chunk->old_startblock = chunk->old_startblock;
+ split_chunk->new_daddr = chunk->new_daddr;
+ split_chunk->oz = chunk->oz;
+ atomic_inc(&chunk->oz->oz_ref);
+
+ split_chunk->victim_rtg = chunk->victim_rtg;
+ atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
+ chunk->offset += split_len;
+ chunk->len -= split_len;
+ chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+ /* add right before the original chunk */
+ WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&split_chunk->entry, &chunk->entry);
+ return split_chunk;
+}
+
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ struct xfs_gc_bio *split_chunk;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ XFS_STATS_INC(mp, xs_gc_write_calls);
+ XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_move_tail(&chunk->entry, &data->writing);
+
+ bio_reuse(&chunk->bio, REQ_OP_WRITE);
+ while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
+ xfs_zone_gc_submit_write(data, split_chunk);
+ xfs_zone_gc_submit_write(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ data->scratch_tail =
+ (data->scratch_tail + chunk->len) % data->scratch_size;
+ data->scratch_available += chunk->len;
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ *
+ * Note that xfs_zoned_end_io() below checks that no other writer raced
+ * with us to update the mapping by checking that the old startblock
+ * didn't change.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->oz, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_rtgroup *rtg = chunk->bio.bi_private;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ if (chunk->bio.bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
+ atomic_inc(&zi->zi_nr_free_zones);
+
+ xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+ wake_up_all(&zi->zi_zone_wait);
+out:
+ list_del(&chunk->entry);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_submit_zone_reset_bio(
+ struct bio *bio,
+ void *priv)
+{
+ struct xfs_rtgroup *rtg = priv;
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ trace_xfs_zone_reset(rtg);
+
+ ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ XFS_STATS_INC(mp, xs_gc_zone_reset_calls);
+
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
+ if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ /*
+ * Also use the bio to drive the state machine when neither
+ * zone reset nor discard is supported to keep things simple.
+ */
+ if (!bdev_max_discard_sectors(bio->bi_bdev)) {
+ bio_endio(bio);
+ return;
+ }
+ bio->bi_opf &= ~REQ_OP_ZONE_RESET;
+ bio->bi_opf |= REQ_OP_DISCARD;
+ bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
+ }
+
+ submit_bio(bio);
+}
+
+int
+xfs_zone_gc_reset_sync(
+ struct xfs_rtgroup *rtg)
+{
+ struct bio bio;
+ int error;
+
+ bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+ REQ_OP_ZONE_RESET | REQ_SYNC);
+ bio_await(&bio, rtg, xfs_submit_zone_reset_bio);
+ error = blk_status_to_errno(bio.bi_status);
+ bio_uninit(&bio);
+ return error;
+}
+
+static void
+xfs_zone_gc_reset_zones(
+ struct xfs_zone_gc_data *data,
+ struct xfs_group *reset_list)
+{
+ struct xfs_group *next = reset_list;
+
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ do {
+ struct xfs_rtgroup *rtg = to_rtg(next);
+ struct xfs_gc_bio *chunk;
+ struct bio *bio;
+
+ xfs_log_force_inode(rtg_rmap(rtg));
+
+ next = rtg_group(rtg)->xg_next_reset;
+ rtg_group(rtg)->xg_next_reset = NULL;
+
+ bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_gc_end_io;
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->data = data;
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->resetting);
+ xfs_submit_zone_reset_bio(bio, rtg);
+ } while (next);
+}
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static void
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_gc_bio *chunk, *next;
+ struct xfs_group *reset_list;
+ struct blk_plug plug;
+
+ spin_lock(&zi->zi_reset_list_lock);
+ reset_list = zi->zi_reset_list;
+ zi->zi_reset_list = NULL;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (reset_list) {
+ set_current_state(TASK_RUNNING);
+ xfs_zone_gc_reset_zones(data, reset_list);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ set_current_state(TASK_RUNNING);
+ xfs_zone_gc_finish_reset(chunk);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ set_current_state(TASK_RUNNING);
+ xfs_zone_gc_finish_chunk(chunk);
+ }
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ set_current_state(TASK_RUNNING);
+ xfs_zone_gc_write_chunk(chunk);
+ }
+ blk_finish_plug(&plug);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before. Because of that reflinks
+ * are currently not supported on zoned file systems and can't be created or
+ * mounted.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_zone_gc_data *data = private;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_zonegc_running(mp);
+
+ xfs_zone_gc_handle_work(data);
+
+ /*
+ * Only sleep if nothing set the state to running. Else check for
+ * work again as someone might have queued up more work and woken
+ * us in the meantime.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
+ continue;
+ }
+
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !zi->zi_reset_list) {
+ xfs_clear_zonegc_running(mp);
+ xfs_zoned_resv_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_zonegc_running(mp);
+
+ if (data->oz)
+ xfs_open_zone_put(data->oz);
+ if (data->iter.victim_rtg)
+ xfs_rtgroup_rele(data->iter.victim_rtg);
+
+ memalloc_nofs_restore(nofs_flag);
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_wakeup(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ /*
+ * If we are unmounting the file system we must not try to
+ * wake gc as m_zone_info might have been freed already.
+ */
+ if (down_read_trylock(&sb->s_umount)) {
+ if (!xfs_is_readonly(mp))
+ wake_up_process(mp->m_zone_info->zi_gc_thread);
+ up_read(&sb->s_umount);
+ }
+}
+
+int
+xfs_zone_gc_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_gc_data *data;
+ int error;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data)
+ return -ENOMEM;
+
+ /*
+ * If there are no free zones available for GC, or the number of open
+ * zones has reached the open zone limit, pick the open zone with
+ * the least used space to GC into. This should only happen after an
+ * unclean shutdown while GC was ongoing. Otherwise a GC zone will
+ * be selected from the free zone pool on demand.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
+ zi->zi_nr_open_zones >= mp->m_max_open_zones) {
+ error = xfs_zone_gc_steal_open_zone(data);
+ if (error) {
+ xfs_warn(mp, "unable to steal an open zone for gc");
+ goto out_free_gc_data;
+ }
+ }
+
+ zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+ "xfs-zone-gc/%s", mp->m_super->s_id);
+ if (IS_ERR(zi->zi_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ error = PTR_ERR(zi->zi_gc_thread);
+ goto out_put_oz;
+ }
+
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(zi->zi_gc_thread);
+ return 0;
+
+out_put_oz:
+ if (data->oz)
+ xfs_open_zone_put(data->oz);
+out_free_gc_data:
+ xfs_zone_gc_data_free(data);
+ return error;
+}
+
+void
+xfs_zone_gc_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ kthread_stop(zi->zi_gc_thread);
+}