path: root/fs/xfs/xfs_zone_space_resv.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support an rtextsize > 1, so this code
 * and the allocator itself use file system blocks interchangeably with
 * realtime extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on the stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes
 * available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the capacity available to the user, i.e. how
 * far the file system can be filled, while XC_FREE_RTAVAILABLE counts the
 * blocks instantly available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is additionally restricted by at least one zone as well as the optional
 * persistently reserved blocks.  This allows the allocator to run more
 * smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}
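
/*
 * Worked example (illustrative numbers only; the real constants live in
 * xfs_zones.h and the zone size depends on the device): assuming
 * 65536-block zones, sb_rtreserved == 0, XFS_RESERVED_ZONES == 5 and
 * XFS_GC_ZONES == 4, the reservations come out as:
 *
 *	XC_FREE_RTEXTENTS:	5 * 65536 + 0 = 327680 blocks
 *	XC_FREE_RTAVAILABLE:	4 * 65536     = 262144 blocks
 *
 * i.e. one extra zone (plus any persistent reservation) sits between
 * "instantly writable" and "counted as free", which lets the allocator
 * keep going without immediately triggering GC.
 */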

void
xfs_zoned_resv_wake_all(
	struct xfs_mount		*mp)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

void
xfs_zoned_add_available(
	struct xfs_mount		*mp,
	xfs_filblks_t			count_fsb)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}
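
/*
 * Example of the wake order above (hypothetical numbers): if the sum of
 * XC_FREE_RTAVAILABLE is 120 blocks and the queued reservations ask for
 * 40, 50 and 60 blocks in that order, the first two waiters are woken
 * (120 -> 80 -> 30) and the 60-block waiter keeps sleeping until more
 * space is added.
 */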

static int
xfs_zoned_space_wait_error(
	struct xfs_mount		*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_inode		*ip,
	xfs_filblks_t			count_fsb,
	unsigned int			flags)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int				error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a very generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail, we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
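		/*
		 * Publish the sleeping state before retrying the counter and
		 * dropping the lock: a wakeup from xfs_zoned_add_available()
		 * issued after the unlock below then simply makes schedule()
		 * return immediately instead of being lost.
		 */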
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already. As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of non-
		 * reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up, as we're fully
		 * out of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode		*ip,
	xfs_filblks_t			*count_fsb,
	unsigned int			flags)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_info		*zi = mp->m_zone_info;
	s64				len = *count_fsb;
	int				error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

int
xfs_zoned_space_reserve(
	struct xfs_inode		*ip,
	xfs_filblks_t			count_fsb,
	unsigned int			flags,
	struct xfs_zone_alloc_ctx	*ac)
{
	struct xfs_mount		*mp = ip->i_mount;
	int				error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

void
xfs_zoned_space_unreserve(
	struct xfs_inode		*ip,
	struct xfs_zone_alloc_ctx	*ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}
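
/*
 * Usage sketch (not part of this file; the surrounding context is
 * hypothetical): callers pair xfs_zoned_space_reserve() with
 * xfs_zoned_space_unreserve() around the actual space consumption,
 * passing 0 or a combination of the XFS_ZR_* flags.  Kept under #if 0
 * as it is illustrative only.
 */
#if 0
static int
example_zoned_write(
	struct xfs_inode		*ip,
	xfs_filblks_t			count_fsb)
{
	struct xfs_zone_alloc_ctx	ac = { };
	int				error;

	/* Block until both free and instantly writable space is reserved. */
	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	/* ... allocate and write blocks, consuming ac.reserved_blocks ... */

	/* Return anything left in the reservation to the free counters. */
	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}
#endif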