diff options
Diffstat (limited to 'fs/xfs/xfs_zone_gc.c')
| -rw-r--r-- | fs/xfs/xfs_zone_gc.c | 1235 |
1 files changed, 1235 insertions, 0 deletions
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..c8a1d5c0332c --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs_platform.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zoned. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch allocation, and the number of buffers. + */ +#define XFS_GC_BUF_SIZE SZ_1M +#define XFS_GC_NR_BUFS 2 +static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS); + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + + /* Are we writing to a sequential write required zone? */ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + struct xfs_rtgroup *victim_rtg; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + struct xfs_open_zone *oz; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpad to buffer GC data, organized as a ring buffer over + * discontiguous folios. scratch_head is where the buffer is filled, + * scratch_tail tracks the buffer space freed, and scratch_available + * counts the space available in the ring buffer between the head and + * the tail. + */ + struct folio *scratch_folios[XFS_GC_NR_BUFS]; + unsigned int scratch_size; + unsigned int scratch_available; + unsigned int scratch_head; + unsigned int scratch_tail; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + s64 available, free, threshold; + s32 remainder; + + /* If we have no reclaimable blocks, running GC is useless. */ + if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) + return false; + + /* + * In order to avoid file fragmentation as much as possible, we should + * make sure that we can open enough zones. So trigger GC if the number + * of blocks immediately available for writes is lower than the total + * number of blocks from all possible open zones. + */ + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + if (available < + xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + + /* + * For cases where the user wants to be more aggressive with GC, + * the sysfs attribute zonegc_low_space may be set to a non zero value, + * to indicate that GC should try to maintain at least zonegc_low_space + * percent of the free space to be directly available for writing. Check + * this here. + */ + if (!mp->m_zonegc_low_space) + return false; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + threshold = div_s64_rem(free, 100, &remainder); + threshold = threshold * mp->m_zonegc_low_space + + remainder * div_s64(mp->m_zonegc_low_space, 100); + + return available < threshold; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc_obj(*data); + if (!data) + return NULL; + data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS); + if (!data->iter.recs) + goto out_free_data; + + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_GC_NR_BUFS; i++) { + data->scratch_folios[i] = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE)); + if (!data->scratch_folios[i]) + goto out_free_scratch; + } + data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS; + data->scratch_available = data->scratch_size; + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch_folios[i]); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_GC_NR_BUFS; i++) + folio_put(data->scratch_folios[i]); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; + atomic_inc(&victim_rtg->rtg_gccount); +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate. + */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + tp = xfs_trans_alloc_empty(mp); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + atomic_dec(&iter->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_irec( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. + */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* + * If the zone is already undergoing GC, don't pick it again. + * + * This prevents us from picking one of the zones for which we + * already submitted GC I/O, but for which the remapping hasn't + * concluded yet. This won't cause data corruption, but + * increases write amplification and slows down GC, so this is + * a bad thing. + */ + if (atomic_read(&rtg->rtg_gccount)) { + xfs_rtgroup_rele(rtg); + continue; + } + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static int +xfs_zone_gc_steal_open_zone( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || oz->oz_allocated < found->oz_allocated) + found = oz; + } + if (!found) { + spin_unlock(&zi->zi_open_zones_lock); + return -EIO; + } + + trace_xfs_zone_gc_target_stolen(found->oz_rtg); + found->oz_is_gc = true; + zi->zi_nr_open_zones--; + zi->zi_nr_open_gc_zones++; + spin_unlock(&zi->zi_open_zones_lock); + + atomic_inc(&found->oz_ref); + data->oz = found; + return 0; +} + +/* + * Ensure we have a valid open zone to write to. + */ +static bool +xfs_zone_gc_select_target( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + + if (data->oz) { + /* + * If we have space available, just keep using the existing + * zone. + */ + if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg)) + return true; + + /* + * Wait for all writes to the current zone to finish before + * picking a new one. + */ + if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg)) + return false; + + xfs_open_zone_put(data->oz); + } + + /* + * Open a new zone when there is none currently in use. + */ + ASSERT(zi->zi_nr_open_zones <= + data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true); + if (!data->oz) + return false; + trace_xfs_zone_gc_target_opened(data->oz->oz_rtg); + atomic_inc(&data->oz->oz_ref); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_nr_open_gc_zones++; + list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones); + spin_unlock(&zi->zi_open_zones_lock); + return true; +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static bool +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz = data->oz; + + *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available)); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. + */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_allocated); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return false; + + *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); + oz->oz_allocated += *count_fsb; + atomic_inc(&oz->oz_ref); + return true; +} + +static void +xfs_zone_gc_add_data( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + unsigned int len = chunk->len; + unsigned int off = data->scratch_head; + + do { + unsigned int this_off = off % XFS_GC_BUF_SIZE; + unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off); + + bio_add_folio_nofail(&chunk->bio, + data->scratch_folios[off / XFS_GC_BUF_SIZE], + this_len, this_off); + len -= this_len; + off += this_len; + if (off == data->scratch_size) + off = 0; + } while (len); +} + +static bool +xfs_zone_gc_can_start_chunk( + struct xfs_zone_gc_data *data) +{ + + if (xfs_is_shutdown(data->mp)) + return false; + if (!data->scratch_available) + return false; + + if (!data->iter.victim_rtg) { + if (kthread_should_stop() || kthread_should_park()) + return false; + if (!xfs_zoned_need_gc(data->mp)) + return false; + if (!xfs_zone_gc_select_victim(data)) + return false; + } + + return xfs_zone_gc_select_target(data); +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (!xfs_zone_gc_can_start_chunk(data)) + return false; + + set_current_state(TASK_RUNNING); + if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip)) + return false; + + if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq)) { + xfs_irele(ip); + return false; + } + + /* + * Scratch allocation can wrap around to the same buffer again, + * provision an extra bvec for that case. + */ + bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS, + &data->bio_set); + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->data = data; + chunk->oz = data->oz; + chunk->victim_rtg = iter->victim_rtg; + atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + xfs_zone_gc_add_data(chunk); + data->scratch_head = + (data->scratch_head + chunk->len) % data->scratch_size; + data->scratch_available -= chunk->len; + + XFS_STATS_INC(mp, xs_gc_read_calls); + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + atomic_dec(&chunk->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(chunk->victim_rtg); + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + split_chunk->victim_rtg = chunk->victim_rtg; + atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + XFS_STATS_INC(mp, xs_gc_write_calls); + XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len); + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reuse(&chunk->bio, REQ_OP_WRITE); + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + data->scratch_tail = + (data->scratch_tail + chunk->len) % data->scratch_size; + data->scratch_available += chunk->len; + + /* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. + */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static void +xfs_submit_zone_reset_bio( + struct bio *bio, + void *priv) +{ + struct xfs_rtgroup *rtg = priv; + struct xfs_mount *mp = rtg_mount(rtg); + + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) { + bio_io_error(bio); + return; + } + + XFS_STATS_INC(mp, xs_gc_zone_reset_calls); + + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (!bdev_max_discard_sectors(bio->bi_bdev)) { + bio_endio(bio); + return; + } + bio->bi_opf &= ~REQ_OP_ZONE_RESET; + bio->bi_opf |= REQ_OP_DISCARD; + bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg)); + } + + submit_bio(bio); +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + struct bio bio; + int error; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, + REQ_OP_ZONE_RESET | REQ_SYNC); + bio_await(&bio, rtg, xfs_submit_zone_reset_bio); + error = blk_status_to_errno(bio.bi_status); + bio_uninit(&bio); + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + xfs_submit_zone_reset_bio(bio, rtg); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). + */ +static void +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (reset_list) { + set_current_state(TASK_RUNNING); + xfs_zone_gc_reset_zones(data, reset_list); + } + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + set_current_state(TASK_RUNNING); + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + set_current_state(TASK_RUNNING); + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + set_current_state(TASK_RUNNING); + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that reflinks + * are currently not supported on zoned file systems and can't be created or + * mounted. + */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + + xfs_zone_gc_handle_work(data); + + /* + * Only sleep if nothing set the state to running. Else check for + * work again as someone might have queued up more work and woken + * us in the meantime. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + continue; + } + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->oz) + xfs_open_zone_put(data->oz); + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_wakeup( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + /* + * If we are unmounting the file system we must not try to + * wake gc as m_zone_info might have been freed already. + */ + if (down_read_trylock(&sb->s_umount)) { + if (!xfs_is_readonly(mp)) + wake_up_process(mp->m_zone_info->zi_gc_thread); + up_read(&sb->s_umount); + } +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + int error; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) + return -ENOMEM; + + /* + * If there are no free zones available for GC, or the number of open + * zones has reached the open zone limit, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown while GC was ongoing. Otherwise a GC zone will + * be selected from the free zone pool on demand. + */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones >= mp->m_max_open_zones) { + error = xfs_zone_gc_steal_open_zone(data); + if (error) { + xfs_warn(mp, "unable to steal an open zone for gc"); + goto out_free_gc_data; + } + } + + zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(zi->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(zi->zi_gc_thread); + goto out_put_oz; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(zi->zi_gc_thread); + return 0; + +out_put_oz: + if (data->oz) + xfs_open_zone_put(data->oz); +out_free_gc_data: + xfs_zone_gc_data_free(data); + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); +} |
