fs/bcachefs/disk_accounting_format.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H

#include "replicas_format.h"

/*
 * Disk accounting - KEY_TYPE_accounting - on disk format:
 *
 * Here, the key has considerably more structure than a typical key (bpos); an
 * accounting key is 'struct disk_accounting_pos', which is a union with bpos.
 *
 * More specifically: a key is just a multiword integer (where word endianness
 * matches native byte order), so we're treating bpos as an opaque 20 byte
 * integer and mapping bch_accounting_key to that.
 *
 * This is a type-tagged union of all our various subtypes; a disk accounting
 * key can be device counters, replicas counters, et cetera - it's extensible.
 *
 * The value is a list of u64s or s64s; the number of counters is specific to a
 * given accounting type.
 *
 * Unlike with other key types, updates are _deltas_, and the deltas are not
 * resolved until the update to the underlying btree, done by btree write buffer
 * flush or journal replay.
 *
 * Journal replay in particular requires special handling. The journal tracks a
 * range of entries which may not yet have been applied to the btree - it does
 * not know definitively whether individual entries are dirty and still need to
 * be applied.
 *
 * To handle this, we use the version field of struct bkey, and give every
 * accounting update a unique version number - a total ordering in time; the
 * version number is derived from the key's position in the journal. Then
 * journal replay can compare the version number of the key from the journal
 * with the version number of the key in the btree to determine if a key needs
 * to be replayed.
 *
 * For this to work, we must maintain this strict time ordering of updates as
 * they are flushed to the btree, both via write buffer flush and via journal
 * replay. This has complications for the write buffer code while journal replay
 * is still in progress; the write buffer cannot flush any accounting keys to
 * the btree until journal replay has finished replaying its accounting keys, or
 * the (newer) version number of the keys from the write buffer will cause
 * updates from journal replay to be lost.
 */

/*
 * Value of a KEY_TYPE_accounting key: a struct bch_val followed by a flexible
 * array of 64-bit counters.
 */
struct bch_accounting {
	struct bch_val		v;
	/* counters; how many are present depends on the accounting type */
	__u64			d[];
};

/*
 * Upper bound on the number of counters in one accounting key; no entry in
 * BCH_DISK_ACCOUNTING_TYPES() declares more than this.
 */
#define BCH_ACCOUNTING_MAX_COUNTERS		3

/*
 * Table of data types, expanded through a caller-supplied x():
 *
 * field 1: name
 * field 2: id
 */
#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)		\
	x(unstriped,	10)

/*
 * Each enumerator is bound explicitly to the id column of BCH_DATA_TYPES(),
 * the same way enum disk_accounting_type binds its ids, so the numeric value
 * of a data type is whatever the table says rather than its position in the
 * table. With the current table the two agree, so existing values are
 * unchanged.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t = n,
	BCH_DATA_TYPES()
#undef x
	/* one more than the id of the last table entry */
	BCH_DATA_NR
};

/*
 * Returns true when @type is one of free, need_gc_gens or need_discard, and
 * false for every other value.
 */
static inline bool data_type_is_empty(enum bch_data_type type)
{
	return type == BCH_DATA_free ||
	       type == BCH_DATA_need_gc_gens ||
	       type == BCH_DATA_need_discard;
}

/*
 * Returns true when @type is sb or journal, and false for every other value.
 */
static inline bool data_type_is_hidden(enum bch_data_type type)
{
	return type == BCH_DATA_sb || type == BCH_DATA_journal;
}

/*
 * Table of accounting key types, expanded through a caller-supplied x():
 *
 *   x(name, id, number of counters)
 *
 * The number of counters is at most 3.
 */

#define BCH_DISK_ACCOUNTING_TYPES()		\
	x(nr_inodes,		0,	1)	\
	x(persistent_reserved,	1,	1)	\
	x(replicas,		2,	1)	\
	x(dev_data_type,	3,	3)	\
	x(compression,		4,	3)	\
	x(snapshot,		5,	1)	\
	x(btree,		6,	1)	\
	x(rebalance_work,	7,	1)	\
	x(inum,			8,	3)

/* Type ids, taken from the id column of BCH_DISK_ACCOUNTING_TYPES(). */
enum disk_accounting_type {
#define x(name, id, ...)	BCH_DISK_ACCOUNTING_##name	= id,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	/* one more than the id of the last table entry */
	BCH_DISK_ACCOUNTING_TYPE_NR,
};

/*
 * No subtypes - number of inodes in the entire filesystem
 *
 * XXX: perhaps we could add a per-subvolume counter?
 */
struct bch_acct_nr_inodes {
	/* no fields: this accounting type has no subtypes to key on */
};

/*
 * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
 * reservation:
 */
struct bch_acct_persistent_reserved {
	/* replica count of the reservations counted under this key */
	__u8			nr_replicas;
};

/*
 * device, data type counter fields:
 * [
 *   nr_buckets
 *   live sectors (in buckets of that data type)
 *   sectors of internal fragmentation
 * ]
 *
 * XXX: live sectors should've been done differently, you can have multiple data
 * types in the same bucket (user, stripe, cached) and this collapses them to
 * the bucket data type, and makes the internal fragmentation counter redundant
 */
struct bch_acct_dev_data_type {
	/* device the counters apply to; NOTE(review): presumably a member device index - confirm */
	__u8			dev;
	/* data type the counters apply to; NOTE(review): presumably an enum bch_data_type value - confirm */
	__u8			data_type;
};

/*
 * Compression type fields:
 * [
 *   number of extents
 *   uncompressed size
 *   compressed size
 * ]
 *
 * Compression ratio, average extent size (fragmentation).
 */
struct bch_acct_compression {
	/* compression type the counters apply to; NOTE(review): value space is defined outside this file - confirm */
	__u8			type;
};

/*
 * On disk usage by snapshot id; counts same values as replicas counter, but
 * aggregated differently
 */
struct bch_acct_snapshot {
	/* snapshot id the usage is aggregated under */
	__u32			id;
} __packed;

/*
 * Key fields for the 'btree' accounting type, which carries a single counter
 * per BCH_DISK_ACCOUNTING_TYPES().
 *
 * NOTE(review): what the counter measures is not stated in this file - confirm
 * against the code that updates it.
 */
struct bch_acct_btree {
	/* NOTE(review): presumably a btree id - confirm */
	__u32			id;
} __packed;

/*
 * inum counter fields:
 * [
 *   number of extents
 *   sum of extent sizes - bkey size
 *     this field is similar to inode.bi_sectors, except here extents in
 *     different snapshots but the same inode number are all collapsed to the
 *     same counter
 *   sum of on disk size - same values tracked by replicas counters
 * ]
 *
 * This tracks on disk fragmentation.
 */
struct bch_acct_inum {
	/* inode number; extents in different snapshots share this one key */
	__u64			inum;
} __packed;

/*
 * Simple counter of the amount of data (on disk sectors) rebalance needs to
 * move, extents counted here are also in the rebalance_work btree.
 */
struct bch_acct_rebalance_work {
	/* no fields: a single counter, not broken out by any sub-key */
};

/*
 * Key of an accounting entry.
 *
 * The outer anonymous union overlays two views of the same storage:
 *   - a one-byte type tag followed by the per-type key fields (inner union)
 *   - a struct bpos, through _pad
 *
 * The tagged struct and the inner union are both __packed.
 */
struct disk_accounting_pos {
	union {
	struct {
		/*
		 * Tag for the union below.
		 * NOTE(review): presumably an enum disk_accounting_type value - confirm
		 */
		__u8				type;
		union {
		struct bch_acct_nr_inodes	nr_inodes;
		struct bch_acct_persistent_reserved	persistent_reserved;
		struct bch_replicas_entry_v1	replicas;
		struct bch_acct_dev_data_type	dev_data_type;
		struct bch_acct_compression	compression;
		struct bch_acct_snapshot	snapshot;
		struct bch_acct_btree		btree;
		struct bch_acct_rebalance_work	rebalance_work;
		struct bch_acct_inum		inum;
		} __packed;
	} __packed;
		/* bpos view of the same bytes; also makes the union at least sizeof(struct bpos) */
		struct bpos			_pad;
	};
};

#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */