summaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c3
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_attr.c4
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c47
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c111
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c411
-rw-r--r--fs/xfs/libxfs/xfs_btree.h28
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.c1
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c10
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c9
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h51
-rw-r--r--fs/xfs/libxfs/xfs_fs.h10
-rw-r--r--fs/xfs/libxfs/xfs_health.h6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c65
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c201
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h16
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h4
-rw-r--r--fs/xfs/libxfs/xfs_metadir.c4
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c223
-rw-r--r--fs/xfs/libxfs/xfs_metafile.h13
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c278
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h23
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c178
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h12
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h9
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.c74
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h58
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c757
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.h189
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c1035
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.h210
-rw-r--r--fs/xfs/libxfs/xfs_sb.c14
-rw-r--r--fs/xfs/libxfs/xfs_shared.h21
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c37
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h13
-rw-r--r--fs/xfs/libxfs/xfs_types.h7
-rw-r--r--fs/xfs/scrub/agheader_repair.c2
-rw-r--r--fs/xfs/scrub/alloc_repair.c5
-rw-r--r--fs/xfs/scrub/bmap.c126
-rw-r--r--fs/xfs/scrub/bmap_repair.c148
-rw-r--r--fs/xfs/scrub/common.c170
-rw-r--r--fs/xfs/scrub/common.h31
-rw-r--r--fs/xfs/scrub/cow_repair.c180
-rw-r--r--fs/xfs/scrub/health.c2
-rw-r--r--fs/xfs/scrub/inode.c41
-rw-r--r--fs/xfs/scrub/inode_repair.c205
-rw-r--r--fs/xfs/scrub/metapath.c6
-rw-r--r--fs/xfs/scrub/newbt.c42
-rw-r--r--fs/xfs/scrub/newbt.h1
-rw-r--r--fs/xfs/scrub/orphanage.c9
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/quota_repair.c2
-rw-r--r--fs/xfs/scrub/reap.c288
-rw-r--r--fs/xfs/scrub/reap.h9
-rw-r--r--fs/xfs/scrub/refcount.c2
-rw-r--r--fs/xfs/scrub/refcount_repair.c6
-rw-r--r--fs/xfs/scrub/repair.c197
-rw-r--r--fs/xfs/scrub/repair.h35
-rw-r--r--fs/xfs/scrub/rgb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rgsuper.c6
-rw-r--r--fs/xfs/scrub/rmap_repair.c91
-rw-r--r--fs/xfs/scrub/rtb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rtbitmap.c77
-rw-r--r--fs/xfs/scrub/rtbitmap.h55
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c451
-rw-r--r--fs/xfs/scrub/rtrefcount.c661
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c783
-rw-r--r--fs/xfs/scrub/rtrmap.c323
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c1006
-rw-r--r--fs/xfs/scrub/rtsummary.c17
-rw-r--r--fs/xfs/scrub/rtsummary_repair.c3
-rw-r--r--fs/xfs/scrub/scrub.c30
-rw-r--r--fs/xfs/scrub/scrub.h28
-rw-r--r--fs/xfs/scrub/stats.c2
-rw-r--r--fs/xfs/scrub/tempexch.h2
-rw-r--r--fs/xfs/scrub/tempfile.c21
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h280
-rw-r--r--fs/xfs/xfs_aops.c68
-rw-r--r--fs/xfs/xfs_attr_inactive.c5
-rw-r--r--fs/xfs/xfs_buf.c808
-rw-r--r--fs/xfs/xfs_buf.h19
-rw-r--r--fs/xfs/xfs_buf_item.h5
-rw-r--r--fs/xfs/xfs_buf_item_recover.c19
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c26
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_drain.c20
-rw-r--r--fs/xfs/xfs_drain.h7
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_exchrange.c74
-rw-r--r--fs/xfs/xfs_file.c6
-rw-r--r--fs/xfs/xfs_fsmap.c193
-rw-r--r--fs/xfs/xfs_fsops.c30
-rw-r--r--fs/xfs/xfs_health.c2
-rw-r--r--fs/xfs/xfs_inode.c26
-rw-r--r--fs/xfs/xfs_inode.h16
-rw-r--r--fs/xfs/xfs_inode_item.c30
-rw-r--r--fs/xfs/xfs_inode_item_recover.c48
-rw-r--r--fs/xfs/xfs_ioctl.c21
-rw-r--r--fs/xfs/xfs_iomap.c14
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c6
-rw-r--r--fs/xfs/xfs_mount.c21
-rw-r--r--fs/xfs/xfs_mount.h25
-rw-r--r--fs/xfs/xfs_notify_failure.c230
-rw-r--r--fs/xfs/xfs_notify_failure.h11
-rw-r--r--fs/xfs/xfs_qm.c10
-rw-r--r--fs/xfs/xfs_qm_bhv.c81
-rw-r--r--fs/xfs/xfs_quota.h5
-rw-r--r--fs/xfs/xfs_refcount_item.c240
-rw-r--r--fs/xfs/xfs_reflink.c321
-rw-r--r--fs/xfs/xfs_reflink.h4
-rw-r--r--fs/xfs/xfs_rmap_item.c216
-rw-r--r--fs/xfs/xfs_rtalloc.c123
-rw-r--r--fs/xfs/xfs_rtalloc.h20
-rw-r--r--fs/xfs/xfs_stats.c5
-rw-r--r--fs/xfs/xfs_stats.h3
-rw-r--r--fs/xfs/xfs_super.c155
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c2
-rw-r--r--fs/xfs/xfs_trace.h271
-rw-r--r--fs/xfs/xfs_trans.c6
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c9
-rw-r--r--fs/xfs/xfs_trans_buf.c8
-rw-r--r--fs/xfs/xfs_trans_dquot.c8
138 files changed, 11135 insertions, 1698 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index ed9b0dabc1f1..7afa51e41427 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -51,6 +51,8 @@ xfs-y += $(addprefix libxfs/, \
xfs_rmap_btree.o \
xfs_refcount.o \
xfs_refcount_btree.o \
+ xfs_rtrefcount_btree.o \
+ xfs_rtrmap_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
xfs_trans_inode.o \
@@ -193,6 +195,8 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rgsuper.o \
rtbitmap.o \
+ rtrefcount.o \
+ rtrmap.o \
rtsummary.o \
)
@@ -232,6 +236,8 @@ xfs-y += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtbitmap_repair.o \
+ rtrefcount_repair.o \
+ rtrmap_repair.o \
rtsummary_repair.o \
)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index f5d853089019..fb79215a509d 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -114,6 +114,7 @@ xfs_ag_resv_needed(
case XFS_AG_RESV_RMAPBT:
len -= xfs_perag_resv(pag, type)->ar_reserved;
break;
+ case XFS_AG_RESV_METAFILE:
case XFS_AG_RESV_NONE:
/* empty */
break;
@@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_METAFILE:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
@@ -389,6 +391,7 @@ xfs_ag_resv_free_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_METAFILE:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3d33e17f2e5c..7839efe050bf 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
-#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
@@ -410,8 +408,8 @@ xfs_alloc_compute_diff(
if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
if (newlen1 < newlen2 ||
(newlen1 == newlen2 &&
- XFS_ABSDIFF(newbno1, wantbno) >
- XFS_ABSDIFF(newbno2, wantbno)))
+ abs_diff(newbno1, wantbno) >
+ abs_diff(newbno2, wantbno)))
newbno1 = newbno2;
} else if (newbno2 != NULLAGBLOCK)
newbno1 = newbno2;
@@ -427,7 +425,7 @@ xfs_alloc_compute_diff(
} else
newbno1 = freeend - wantlen;
*newbnop = newbno1;
- return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+ return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno);
}
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 17875ad865f5..8c04acd30d48 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1004,9 +1004,7 @@ xfs_attr_add_fork(
unsigned int blks; /* space reservation */
int error; /* error return value */
- if (xfs_is_metadir_inode(ip))
- ASSERT(XFS_IS_DQDETACHED(ip));
- else
+ if (!xfs_is_metadir_inode(ip))
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
blks = XFS_ADDAFORK_SPACE_RES(mp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 5255f93bae31..0ef19f1469ec 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -615,7 +615,7 @@ xfs_bmap_btree_to_extents(
xfs_trans_binval(tp, cbp);
if (cur->bc_levels[0].bp == cbp)
cur->bc_levels[0].bp = NULL;
- xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_bmap_broot_realloc(ip, whichfork, 0);
ASSERT(ifp->if_broot == NULL);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -659,12 +659,11 @@ xfs_bmap_extents_to_btree(
* Make space in the inode incore. This needs to be undone if we fail
* to expand the root.
*/
- xfs_iroot_realloc(ip, 1, whichfork);
+ block = xfs_bmap_broot_realloc(ip, whichfork, 1);
/*
* Fill in the root.
*/
- block = ifp->if_broot;
xfs_bmbt_init_block(ip, block, NULL, 1, 1);
/*
* Need a cursor. Can't allocate until bb_level is filled in.
@@ -746,7 +745,7 @@ xfs_bmap_extents_to_btree(
out_unreserve_dquot:
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
out_root_realloc:
- xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_bmap_broot_realloc(ip, whichfork, 0);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -1043,9 +1042,7 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (xfs_is_metadir_inode(ip))
- ASSERT(XFS_IS_DQDETACHED(ip));
- else
+ if (!xfs_is_metadir_inode(ip))
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
ASSERT(!xfs_inode_has_attr_fork(ip));
@@ -3566,12 +3563,12 @@ xfs_bmap_btalloc_at_eof(
int error;
/*
- * If there are already extents in the file, try an exact EOF block
- * allocation to extend the file as a contiguous extent. If that fails,
- * or it's the first allocation in a file, just try for a stripe aligned
- * allocation.
+ * If there are already extents in the file, and xfs_bmap_adjacent() has
+ * given a better blkno, try an exact EOF block allocation to extend the
+ * file as a contiguous extent. If that fails, or it's the first
+ * allocation in a file, just try for a stripe aligned allocation.
*/
- if (ap->offset) {
+ if (ap->eof) {
xfs_extlen_t nextminlen = 0;
/*
@@ -3739,7 +3736,8 @@ xfs_bmap_btalloc_best_length(
int error;
ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
- xfs_bmap_adjacent(ap);
+ if (!xfs_bmap_adjacent(ap))
+ ap->eof = false;
/*
* Search for an allocation group with a single extent large enough for
@@ -4567,8 +4565,9 @@ xfs_bmapi_write(
* the refcount btree for orphan recovery.
*/
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno,
- bma.length);
+ xfs_refcount_alloc_cow_extent(tp,
+ XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
}
/* Deal with the allocated space we found. */
@@ -4743,7 +4742,8 @@ xfs_bmapi_convert_one_delalloc(
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+ xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -5391,7 +5391,7 @@ xfs_bmap_del_extent_real(
bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
- xfs_refcount_decrease_extent(tp, del);
+ xfs_refcount_decrease_extent(tp, isrt, del);
} else if (isrt && !xfs_has_rtgroups(mp)) {
error = xfs_bmap_free_rtblocks(tp, del);
} else {
@@ -6501,9 +6501,8 @@ xfs_get_extsz_hint(
* No point in aligning allocations if we need to COW to actually
* write to them.
*/
- if (xfs_is_always_cow_inode(ip))
- return 0;
- if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ if (!xfs_is_always_cow_inode(ip) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
return ip->i_extsize;
if (XFS_IS_REALTIME_INODE(ip) &&
ip->i_mount->m_sb.sb_rextsize > 1)
@@ -6526,7 +6525,13 @@ xfs_get_cowextsz_hint(
a = 0;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
a = ip->i_cowextsize;
- b = xfs_get_extsz_hint(ip);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ b = 0;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+ b = ip->i_extsize;
+ } else {
+ b = xfs_get_extsz_hint(ip);
+ }
a = max(a, b);
if (a == 0)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 3464be771f95..908d7b050e9c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -516,6 +516,116 @@ xfs_bmbt_keys_contiguous(
be64_to_cpu(key2->bmbt.br_startoff));
}
+static inline void
+xfs_bmbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records. Move the
+ * records and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by the caller.
+ * When growing this will create holes to be filled in by the caller.
+ *
+ * The caller must not request to add more records than would fit in the
+ * on-disk inode root. If the if_broot is currently NULL, then if we are
+ * adding records, one will be allocated. The caller must also not request
+ * that the number of records go below zero, although it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * whichfork -- which inode fork to change
+ * new_numrecs -- the new number of records requested for the if_broot array
+ *
+ * Returns the incore btree root block.
+ */
+struct xfs_btree_block *
+xfs_bmap_broot_realloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+
+ /*
+ * Block mapping btrees do not support storing zero records; if this
+ * happens, the fork is being changed to FMT_EXTENTS. Free the broot
+ * and get out.
+ */
+ if (new_numrecs == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ new_size = xfs_bmap_broot_space_calc(mp, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false);
+ broot = xfs_broot_realloc(ifp, new_size);
+ ASSERT(xfs_bmap_bmdr_space(broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs);
+ return broot;
+ }
+
+ /*
+ * We're reducing, but not totally eliminating, numrecs. In this case,
+ * we are shrinking the if_broot buffer, so it must already exist.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0);
+
+ /*
+ * Shrink the btree root by moving the bmbt pointers, since they are
+ * not butted up against the btree block header, then reallocating
+ * broot.
+ */
+ xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+ ASSERT(xfs_bmap_bmdr_space(broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ return broot;
+}
+
+static struct xfs_btree_block *
+xfs_bmbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork,
+ new_numrecs);
+}
+
const struct xfs_btree_ops xfs_bmbt_ops = {
.name = "bmap",
.type = XFS_BTREE_TYPE_INODE,
@@ -543,6 +653,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
.keys_contiguous = xfs_bmbt_keys_contiguous,
+ .broot_realloc = xfs_bmbt_broot_realloc,
};
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 49a3bae3f6ec..b238d559ab03 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -198,4 +198,7 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
}
+struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip,
+ int whichfork, unsigned int new_numrecs);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 68ee1c299c25..299ce7fd11b0 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -30,6 +30,12 @@
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_quota.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Btree magic numbers.
@@ -1537,12 +1543,16 @@ xfs_btree_log_recs(
int first,
int last)
{
+ if (!bp) {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ return;
+ }
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
-
}
/*
@@ -3078,6 +3088,131 @@ xfs_btree_split(
#define xfs_btree_split __xfs_btree_split
#endif /* __KERNEL__ */
+/* Move the records from a root leaf block to a separate block. */
+STATIC void
+xfs_btree_promote_leaf_iroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ struct xfs_buf *cbp,
+ union xfs_btree_ptr *cptr,
+ struct xfs_btree_block *cblock)
+{
+ union xfs_btree_rec *rp;
+ union xfs_btree_rec *crp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+ struct xfs_btree_block *broot;
+ int numrecs = xfs_btree_get_numrecs(block);
+
+ /* Copy the records from the leaf broot into the new child block. */
+ rp = xfs_btree_rec_addr(cur, 1, block);
+ crp = xfs_btree_rec_addr(cur, 1, cblock);
+ xfs_btree_copy_recs(cur, crp, rp, numrecs);
+
+ /*
+ * Increment the tree height.
+ *
+ * Trickery here: The amount of memory that we need per record for the
+ * ifork's btree root block may change when we convert the broot from a
+ * leaf to a node block. Free the existing leaf broot so that nobody
+ * thinks we need to migrate node pointers when we realloc the broot
+ * buffer after bumping nlevels.
+ */
+ cur->bc_ops->broot_realloc(cur, 0);
+ cur->bc_nlevels++;
+ cur->bc_levels[1].ptr = 1;
+
+ /*
+ * Allocate a new node broot and initialize it to point to the new
+ * child block.
+ */
+ broot = cur->bc_ops->broot_realloc(cur, 1);
+ xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops,
+ cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino);
+
+ pp = xfs_btree_ptr_addr(cur, 1, broot);
+ kp = xfs_btree_key_addr(cur, 1, broot);
+ xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+ xfs_btree_get_keys(cur, cblock, kp);
+
+ /* Attach the new block to the cursor and log it. */
+ xfs_btree_setbuf(cur, 0, cbp);
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, cbp, 1, numrecs);
+}
+
+/*
+ * Move the keys and pointers from a root block to a separate block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_promote_node_iroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *cbp,
+ union xfs_btree_ptr *cptr,
+ struct xfs_btree_block *cblock)
+{
+ union xfs_btree_key *ckp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *cpp;
+ union xfs_btree_ptr *pp;
+ int i;
+ int error;
+ int numrecs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Increase tree height, adjusting the root block level to match.
+ * We cannot change the root btree node size until we've copied the
+ * block contents to the new child block.
+ */
+ be16_add_cpu(&block->bb_level, 1);
+ cur->bc_nlevels++;
+ cur->bc_levels[level + 1].ptr = 1;
+
+ /*
+ * Adjust the root btree record count, then copy the keys from the old
+ * root to the new child block.
+ */
+ xfs_btree_set_numrecs(block, 1);
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, numrecs);
+
+ /* Check the pointers and copy them to the new child block. */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ return error;
+ }
+ xfs_btree_copy_ptrs(cur, cpp, pp, numrecs);
+
+ /*
+ * Set the first keyptr to point to the new child block, then shrink
+ * the memory buffer for the root block.
+ */
+ error = xfs_btree_debug_check_ptr(cur, cptr, 0, level);
+ if (error)
+ return error;
+ xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+ xfs_btree_get_keys(cur, cblock, kp);
+
+ cur->bc_ops->broot_realloc(cur, 1);
+
+ /* Attach the new block to the cursor and log it. */
+ xfs_btree_setbuf(cur, level, cbp);
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, numrecs);
+ xfs_btree_log_ptrs(cur, cbp, 1, numrecs);
+ return 0;
+}
+
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -3091,14 +3226,10 @@ xfs_btree_new_iroot(
struct xfs_buf *cbp; /* buffer for cblock */
struct xfs_btree_block *block; /* btree block */
struct xfs_btree_block *cblock; /* child btree block */
- union xfs_btree_key *ckp; /* child key pointer */
- union xfs_btree_ptr *cpp; /* child ptr pointer */
- union xfs_btree_key *kp; /* pointer to btree key */
- union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr aptr;
union xfs_btree_ptr nptr; /* new block addr */
int level; /* btree level */
int error; /* error return code */
- int i; /* loop counter */
XFS_BTREE_STATS_INC(cur, newroot);
@@ -3107,10 +3238,15 @@ xfs_btree_new_iroot(
level = cur->bc_nlevels - 1;
block = xfs_btree_get_iroot(cur);
- pp = xfs_btree_ptr_addr(cur, 1, block);
+ ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS));
+ if (level > 0)
+ aptr = *xfs_btree_ptr_addr(cur, 1, block);
+ else
+ aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp,
+ cur->bc_ino.ip->i_ino));
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
+ error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3136,47 +3272,16 @@ xfs_btree_new_iroot(
cblock->bb_u.s.bb_blkno = bno;
}
- be16_add_cpu(&block->bb_level, 1);
- xfs_btree_set_numrecs(block, 1);
- cur->bc_nlevels++;
- ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
- cur->bc_levels[level + 1].ptr = 1;
-
- kp = xfs_btree_key_addr(cur, 1, block);
- ckp = xfs_btree_key_addr(cur, 1, cblock);
- xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
- cpp = xfs_btree_ptr_addr(cur, 1, cblock);
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (level > 0) {
+ error = xfs_btree_promote_node_iroot(cur, block, level, cbp,
+ &nptr, cblock);
if (error)
goto error0;
+ } else {
+ xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock);
}
- xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
- error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
- if (error)
- goto error0;
-
- xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
- xfs_iroot_realloc(cur->bc_ino.ip,
- 1 - xfs_btree_get_numrecs(cblock),
- cur->bc_ino.whichfork);
-
- xfs_btree_setbuf(cur, level, cbp);
-
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
- xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
- xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
- *logflags |=
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
+ *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
*stat = 1;
return 0;
error0:
@@ -3347,7 +3452,7 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
- xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+ cur->bc_ops->broot_realloc(cur, numrecs + 1);
*stat = 1;
} else {
/* A root block that needs replacing */
@@ -3693,6 +3798,97 @@ error0:
return error;
}
+/* Move the records from a child leaf block to the root block. */
+STATIC void
+xfs_btree_demote_leaf_child(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *cblock,
+ int numrecs)
+{
+ union xfs_btree_rec *rp;
+ union xfs_btree_rec *crp;
+ struct xfs_btree_block *broot;
+
+ /*
+ * Decrease the tree height.
+ *
+ * Trickery here: The amount of memory that we need per record for the
+ * ifork's btree root block may change when we convert the broot from a
+ * node to a leaf. Free the old node broot so that we can get a fresh
+ * leaf broot.
+ */
+ cur->bc_ops->broot_realloc(cur, 0);
+ cur->bc_nlevels--;
+
+ /*
+ * Allocate a new leaf broot and copy the records from the old child.
+ * Detach the old child from the cursor.
+ */
+ broot = cur->bc_ops->broot_realloc(cur, numrecs);
+ xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs,
+ cur->bc_ino.ip->i_ino);
+
+ rp = xfs_btree_rec_addr(cur, 1, broot);
+ crp = xfs_btree_rec_addr(cur, 1, cblock);
+ xfs_btree_copy_recs(cur, rp, crp, numrecs);
+
+ cur->bc_levels[0].bp = NULL;
+}
+
+/*
+ * Move the keyptrs from a child node block to the root block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_demote_node_child(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *cblock,
+ int level,
+ int numrecs)
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_key *ckp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *cpp;
+ union xfs_btree_ptr *pp;
+ int i;
+ int error;
+
+ /*
+ * Adjust the root btree node size and the record count to match the
+ * doomed child so that we can copy the keyptrs ahead of changing the
+ * tree shape.
+ */
+ block = cur->bc_ops->broot_realloc(cur, numrecs);
+
+ xfs_btree_set_numrecs(block, numrecs);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ /* Copy keys from the doomed block. */
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ /* Copy pointers from the doomed block. */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (error)
+ return error;
+ }
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ /* Decrease tree height, adjusting the root block level to match. */
+ cur->bc_levels[level - 1].bp = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ cur->bc_nlevels--;
+ return 0;
+}
+
/*
* Try to merge a non-leaf block back into the inode root.
*
@@ -3705,34 +3901,31 @@ STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
- int whichfork = cur->bc_ino.whichfork;
struct xfs_inode *ip = cur->bc_ino.ip;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
- union xfs_btree_key *kp;
- union xfs_btree_key *ckp;
- union xfs_btree_ptr *pp;
- union xfs_btree_ptr *cpp;
struct xfs_buf *cbp;
int level;
- int index;
int numrecs;
int error;
#ifdef DEBUG
union xfs_btree_ptr ptr;
#endif
- int i;
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
- ASSERT(cur->bc_nlevels > 1);
+ ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) ||
+ cur->bc_nlevels > 1);
/*
* Don't deal with the root block needs to be a leaf case.
* We're just going to turn the thing back into extents anyway.
*/
level = cur->bc_nlevels - 1;
- if (level == 1)
+ if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS))
+ goto out0;
+
+ /* If we're already a leaf, jump out. */
+ if (level == 0)
goto out0;
/*
@@ -3762,40 +3955,20 @@ xfs_btree_kill_iroot(
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif
- index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
- if (index) {
- xfs_iroot_realloc(cur->bc_ino.ip, index,
- cur->bc_ino.whichfork);
- block = ifp->if_broot;
- }
-
- be16_add_cpu(&block->bb_numrecs, index);
- ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
- kp = xfs_btree_key_addr(cur, 1, block);
- ckp = xfs_btree_key_addr(cur, 1, cblock);
- xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
- pp = xfs_btree_ptr_addr(cur, 1, block);
- cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-
- for (i = 0; i < numrecs; i++) {
- error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (level > 1) {
+ error = xfs_btree_demote_node_child(cur, cblock, level,
+ numrecs);
if (error)
return error;
- }
-
- xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+ } else
+ xfs_btree_demote_leaf_child(cur, cblock, numrecs);
error = xfs_btree_free_block(cur, cbp);
if (error)
return error;
- cur->bc_levels[level - 1].bp = NULL;
- be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
- cur->bc_nlevels--;
out0:
return 0;
}
@@ -3949,10 +4122,10 @@ xfs_btree_delrec(
/*
* We're at the root level. First, shrink the root block in-memory.
* Try to get rid of the next level down. If we can't then there's
- * nothing left to do.
+ * nothing left to do. numrecs was decremented above.
*/
if (xfs_btree_at_iroot(cur, level)) {
- xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
+ cur->bc_ops->broot_realloc(cur, numrecs);
error = xfs_btree_kill_iroot(cur);
if (error)
@@ -5360,6 +5533,12 @@ xfs_btree_init_cur_caches(void)
error = xfs_refcountbt_init_cur_cache();
if (error)
goto err;
+ error = xfs_rtrmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rtrefcountbt_init_cur_cache();
+ if (error)
+ goto err;
return 0;
err:
@@ -5376,6 +5555,8 @@ xfs_btree_destroy_cur_caches(void)
xfs_bmbt_destroy_cur_cache();
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
+ xfs_rtrmapbt_destroy_cur_cache();
+ xfs_rtrefcountbt_destroy_cur_cache();
}
/* Move the btree cursor before the first record. */
@@ -5404,3 +5585,67 @@ xfs_btree_goto_left_edge(
return 0;
}
+
+/* Allocate a block for an inode-rooted metadata btree. */
+int
+xfs_btree_alloc_metafile_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_alloc_arg args = {
+ .mp = cur->bc_mp,
+ .tp = cur->bc_tp,
+ .resv = XFS_AG_RESV_METAFILE,
+ .minlen = 1,
+ .maxlen = 1,
+ .prod = 1,
+ };
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ int error;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino));
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+
+ xfs_metafile_resv_alloc_space(ip, &args);
+
+ new->l = cpu_to_be64(args.fsbno);
+ *stat = 1;
+ return 0;
+}
+
+/* Free a block from an inode-rooted metadata btree. */
+int
+xfs_btree_free_metafile_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ int error;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE,
+ 0);
+ if (error)
+ return error;
+
+ xfs_metafile_resv_free_space(ip, tp, 1);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index c5bff273cae2..355b304696e6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -135,7 +135,7 @@ struct xfs_btree_ops {
/* offset of btree stats array */
unsigned int statoff;
- /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+ /* sick mask for health reporting (not for bmap btrees) */
unsigned int sick_mask;
/* cursor operations */
@@ -213,11 +213,27 @@ struct xfs_btree_ops {
const union xfs_btree_key *key1,
const union xfs_btree_key *key2,
const union xfs_btree_key *mask);
+
+ /*
+ * Reallocate the space for if_broot to fit the number of records.
+ * Move the records and pointers in if_broot to fit the new size. When
+ * shrinking this will eliminate holes between the records and pointers
+ * created by the caller. When growing this will create holes to be
+ * filled in by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then if
+ * we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although it
+ * can go to zero.
+ */
+ struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur,
+ unsigned int new_numrecs);
};
/* btree geometry flags */
#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */
-
+#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */
union xfs_btree_irec {
struct xfs_alloc_rec_incore a;
@@ -281,7 +297,7 @@ struct xfs_btree_cur
struct {
unsigned int nr_ops; /* # record updates */
unsigned int shape_changes; /* # of extent splits */
- } bc_refc; /* refcountbt */
+ } bc_refc; /* refcountbt/rtrefcountbt */
};
/* Must be at the end of the struct! */
@@ -687,4 +703,10 @@ xfs_btree_at_iroot(
level == cur->bc_nlevels - 1;
}
+int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start, union xfs_btree_ptr *newp,
+ int *stat);
+int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
index df3d613675a1..f2f7b4305413 100644
--- a/fs/xfs/libxfs/xfs_btree_mem.c
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -18,6 +18,7 @@
#include "xfs_ag.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
/* Set the root of an in-memory btree. */
void
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index 694929703152..5ed84f9cc877 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -134,6 +134,7 @@ xfs_btree_stage_ifakeroot(
cur->bc_ino.ifake = ifake;
cur->bc_nlevels = ifake->if_levels;
cur->bc_ino.forksize = ifake->if_fork_size;
+ cur->bc_ino.whichfork = XFS_STAGING_FORK;
cur->bc_flags |= XFS_BTREE_STAGING;
}
@@ -573,6 +574,7 @@ xfs_btree_bload_compute_geometry(
struct xfs_btree_bload *bbl,
uint64_t nr_records)
{
+ const struct xfs_btree_ops *ops = cur->bc_ops;
uint64_t nr_blocks = 0;
uint64_t nr_this_level;
@@ -599,7 +601,7 @@ xfs_btree_bload_compute_geometry(
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &level_blocks, &dontcare64);
- if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+ if (ops->type == XFS_BTREE_TYPE_INODE) {
/*
* If all the items we want to store at this level
* would fit in the inode root block, then we have our
@@ -607,7 +609,9 @@ xfs_btree_bload_compute_geometry(
*
* Note that bmap btrees forbid records in the root.
*/
- if (level != 0 && nr_this_level <= avg_per_block) {
+ if ((level != 0 ||
+ (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) &&
+ nr_this_level <= avg_per_block) {
nr_blocks++;
break;
}
@@ -658,7 +662,7 @@ xfs_btree_bload_compute_geometry(
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
- if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+ if (ops->type == XFS_BTREE_TYPE_INODE)
bbl->nr_blocks = nr_blocks - 1;
else
bbl->nr_blocks = nr_blocks;
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index ec51b8465e61..9effd95ddcd4 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -68,7 +68,9 @@ struct xfs_defer_op_type {
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 202468223bf9..1775abcfa04d 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -197,7 +197,7 @@ xfs_da_unmount(
/*
* Return 1 if directory contains only "." and "..".
*/
-int
+static bool
xfs_dir_isempty(
xfs_inode_t *dp)
{
@@ -205,9 +205,9 @@ xfs_dir_isempty(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_disk_size == 0) /* might happen during shutdown. */
- return 1;
+ return true;
if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
- return 0;
+ return false;
sfp = dp->i_df.if_data;
return !sfp->count;
}
@@ -379,12 +379,11 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return -EEXIST;
- args->value = kmalloc(len,
+ args->value = kmemdup(name, len,
GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
if (!args->value)
return -ENOMEM;
- memcpy(args->value, name, len);
args->valuelen = len;
return -EEXIST;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 576068ed81fa..a6594a5a941d 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -58,7 +58,6 @@ extern void xfs_dir_startup(void);
extern int xfs_da_mount(struct xfs_mount *mp);
extern void xfs_da_unmount(struct xfs_mount *mp);
-extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 7002d7676a78..a53c5d40e084 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -64,7 +64,8 @@
#define XFS_ERRTAG_WB_DELAY_MS 42
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
-#define XFS_ERRTAG_MAX 45
+#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
+#define XFS_ERRTAG_MAX 46
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
+#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
index 2021396651de..3f1d6a98c118 100644
--- a/fs/xfs/libxfs/xfs_exchmaps.c
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -662,7 +662,9 @@ xfs_exchmaps_rmapbt_blocks(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(req->ip1))
- return 0;
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) *
+ XFS_RTRMAPADD_SPACE_RES(mp);
return howmany_64(req->nr_exchanges,
XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 4d47a3e723aa..b1007fb661ba 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -857,6 +857,8 @@ enum xfs_metafile_type {
XFS_METAFILE_PRJQUOTA, /* project quota */
XFS_METAFILE_RTBITMAP, /* rt bitmap */
XFS_METAFILE_RTSUMMARY, /* rt summary */
+ XFS_METAFILE_RTRMAP, /* rt rmap */
+ XFS_METAFILE_RTREFCOUNT, /* rt refcount */
XFS_METAFILE_MAX
} __packed;
@@ -868,7 +870,9 @@ enum xfs_metafile_type {
{ XFS_METAFILE_GRPQUOTA, "grpquota" }, \
{ XFS_METAFILE_PRJQUOTA, "prjquota" }, \
{ XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
- { XFS_METAFILE_RTSUMMARY, "rtsummary" }
+ { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \
+ { XFS_METAFILE_RTRMAP, "rtrmap" }, \
+ { XFS_METAFILE_RTREFCOUNT, "rtrefcount" }
/*
* On-disk inode structure.
@@ -997,7 +1001,8 @@ enum xfs_dinode_fmt {
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
- XFS_DINODE_FMT_UUID /* added long ago, but never used */
+ XFS_DINODE_FMT_UUID, /* added long ago, but never used */
+ XFS_DINODE_FMT_META_BTREE, /* metadata btree */
};
#define XFS_INODE_FORMAT_STR \
@@ -1005,7 +1010,8 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_LOCAL, "local" }, \
{ XFS_DINODE_FMT_EXTENTS, "extent" }, \
{ XFS_DINODE_FMT_BTREE, "btree" }, \
- { XFS_DINODE_FMT_UUID, "uuid" }
+ { XFS_DINODE_FMT_UUID, "uuid" }, \
+ { XFS_DINODE_FMT_META_BTREE, "meta_btree" }
/*
* Max values for extnum and aextnum.
@@ -1726,6 +1732,24 @@ typedef __be32 xfs_rmap_ptr_t;
XFS_IBT_BLOCK(mp) + 1)
/*
+ * Realtime Reverse mapping btree format definitions
+ *
+ * This is a btree for reverse mapping records for realtime volumes
+ */
+#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */
+
+/*
+ * rtrmap root header, on-disk form only.
+ */
+struct xfs_rtrmap_root {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+};
+
+/* inode-based btree pointer type */
+typedef __be64 xfs_rtrmap_ptr_t;
+
+/*
* Reference Count Btree format definitions
*
*/
@@ -1768,12 +1792,29 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-#define MAXREFCOUNT ((xfs_nlink_t)~0U)
-#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U)
+#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U)
/* btree pointer type */
typedef __be32 xfs_refcount_ptr_t;
+/*
+ * Realtime Reference Count btree format definitions
+ *
+ * This is a btree for reference count records for realtime volumes
+ */
+#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */
+
+/*
+ * rt refcount root header, on-disk form only.
+ */
+struct xfs_rtrefcount_root {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+};
+
+/* inode-rooted btree pointer type */
+typedef __be64 xfs_rtrefcount_ptr_t;
/*
* BMAP Btree format definitions
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 41ce4d3d650e..2c3171262b44 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -737,9 +737,11 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */
#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */
#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */
+#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */
+#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 31
+#define XFS_SCRUB_TYPE_NR 33
/*
* This special type code only applies to the vectored scrub implementation.
@@ -829,9 +831,11 @@ struct xfs_scrub_vec_head {
#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */
#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */
#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */
+#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */
+#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */
/* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR (8)
+#define XFS_SCRUB_METAPATH_NR (10)
/*
* ioctl limits
@@ -993,6 +997,8 @@ struct xfs_rtgroup_geometry {
#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */
#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */
+#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */
+#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */
/*
* ioctl commands that are used by Linux filesystems
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index d34986ac18c3..b31000f7190c 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -70,6 +70,8 @@ struct xfs_rtgroup;
#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */
#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */
#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */
+#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */
+#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */
/* Observable health issues for AG metadata. */
#define XFS_SICK_AG_SB (1 << 0) /* superblock */
@@ -115,7 +117,9 @@ struct xfs_rtgroup;
#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \
XFS_SICK_RG_BITMAP | \
- XFS_SICK_RG_SUMMARY)
+ XFS_SICK_RG_SUMMARY | \
+ XFS_SICK_RG_RMAPBT | \
+ XFS_SICK_RG_REFCNTBT)
#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
XFS_SICK_AG_AGF | \
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 424861fbf1bd..f24fa628fecf 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -441,6 +441,30 @@ xfs_dinode_verify_fork(
if (di_nextents > max_extents)
return __this_address;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!xfs_has_metadir(mp))
+ return __this_address;
+ if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)))
+ return __this_address;
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ /*
+ * growfs must create the rtrmap inodes before adding a
+ * realtime volume to the filesystem, so we cannot use
+ * the rtrmapbt predicate here.
+ */
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ /* same comment about growfs and rmap inodes applies */
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ break;
default:
return __this_address;
}
@@ -460,6 +484,10 @@ xfs_dinode_verify_forkoff(
if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
return __this_address;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!xfs_has_metadir(mp) || !xfs_has_parent(mp))
+ return __this_address;
+ fallthrough;
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
@@ -637,9 +665,6 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
- if (nextents + naextents == 0 && nblocks != 0)
- return __this_address;
-
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
@@ -723,7 +748,8 @@ xfs_dinode_verify(
return __this_address;
/* don't let reflink and realtime mix */
- if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) &&
+ !xfs_has_rtreflink(mp))
return __this_address;
/* COW extent size hint validation */
@@ -743,6 +769,12 @@ xfs_dinode_verify(
return fa;
}
+ /* metadata inodes containing btrees always have zero extent count */
+ if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) {
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+ }
+
return NULL;
}
@@ -878,11 +910,29 @@ xfs_inode_validate_cowextsize(
bool rt_flag;
bool hint_flag;
uint32_t cowextsize_bytes;
+ uint32_t blocksize_bytes;
rt_flag = (flags & XFS_DIFLAG_REALTIME);
hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+ /*
+ * Similar to extent size hints, a directory can be configured to
+ * propagate realtime status and a CoW extent size hint to newly
+ * created files even if there is no realtime device, and the hints on
+ * disk can become misaligned if the sysadmin changes the rt extent
+ * size while adding the realtime device.
+ *
+ * Therefore, we can only enforce the rextsize alignment check against
+ * regular realtime files, and rely on callers to decide when alignment
+ * checks are appropriate, and fix things up as needed.
+ */
+
+ if (rt_flag)
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
if (hint_flag && !xfs_has_reflink(mp))
return __this_address;
@@ -896,16 +946,13 @@ xfs_inode_validate_cowextsize(
if (mode && !hint_flag && cowextsize != 0)
return __this_address;
- if (hint_flag && rt_flag)
- return __this_address;
-
- if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ if (cowextsize_bytes % blocksize_bytes)
return __this_address;
if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
- if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2)
return __this_address;
return NULL;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1158ca48626b..4f99b90add55 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -27,6 +27,8 @@
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_ifork_cache;
@@ -178,7 +180,7 @@ xfs_iformat_btree(
struct xfs_mount *mp = ip->i_mount;
xfs_bmdr_block_t *dfp;
struct xfs_ifork *ifp;
- /* REFERENCED */
+ struct xfs_btree_block *broot;
int nrecs;
int size;
int level;
@@ -211,16 +213,13 @@ xfs_iformat_btree(
return -EFSCORRUPTED;
}
- ifp->if_broot_bytes = size;
- ifp->if_broot = kmalloc(size,
- GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
- ASSERT(ifp->if_broot != NULL);
+ broot = xfs_broot_alloc(ifp, size);
/*
* Copy and convert from the on-disk structure
* to the in-memory structure.
*/
xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
- ifp->if_broot, size);
+ broot, size);
ifp->if_bytes = 0;
ifp->if_data = NULL;
@@ -270,6 +269,16 @@ xfs_iformat_data_fork(
return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
case XFS_DINODE_FMT_BTREE:
return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ case XFS_DINODE_FMT_META_BTREE:
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ return xfs_iformat_rtrmap(ip, dip);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xfs_iformat_rtrefcount(ip, dip);
+ default:
+ break;
+ }
+ fallthrough;
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
dip, sizeof(*dip), __this_address);
@@ -363,135 +372,68 @@ xfs_iformat_attr_fork(
}
/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff. Move the records
- * and pointers in if_broot to fit the new size. When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller. When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root. If the if_broot is currently NULL, then
- * if we are adding records, one will be allocated. The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- * requested for the if_broot array.
+ * Allocate the if_broot component of an inode fork so that it is @new_size
+ * bytes in size, using __GFP_NOLOCKDEP like all the other code that
+ * initializes a broot during inode load. Returns if_broot.
*/
-void
-xfs_iroot_realloc(
- xfs_inode_t *ip,
- int rec_diff,
- int whichfork)
+struct xfs_btree_block *
+xfs_broot_alloc(
+ struct xfs_ifork *ifp,
+ size_t new_size)
{
- struct xfs_mount *mp = ip->i_mount;
- int cur_max;
- struct xfs_ifork *ifp;
- struct xfs_btree_block *new_broot;
- int new_max;
- size_t new_size;
- char *np;
- char *op;
+ ASSERT(ifp->if_broot == NULL);
- /*
- * Handle the degenerate case quietly.
- */
- if (rec_diff == 0) {
- return;
- }
+ ifp->if_broot = kmalloc(new_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ return ifp->if_broot;
+}
- ifp = xfs_ifork_ptr(ip, whichfork);
- if (rec_diff > 0) {
- /*
- * If there wasn't any memory allocated before, just
- * allocate it now and get out.
- */
- if (ifp->if_broot_bytes == 0) {
- new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
- ifp->if_broot = kmalloc(new_size,
- GFP_KERNEL | __GFP_NOFAIL);
- ifp->if_broot_bytes = (int)new_size;
- return;
- }
+/*
+ * Reallocate the if_broot component of an inode fork so that it is @new_size
+ * bytes in size. Returns if_broot.
+ */
+struct xfs_btree_block *
+xfs_broot_realloc(
+ struct xfs_ifork *ifp,
+ size_t new_size)
+{
+ /* No size change? No action needed. */
+ if (new_size == ifp->if_broot_bytes)
+ return ifp->if_broot;
- /*
- * If there is already an existing if_broot, then we need
- * to realloc() it and shift the pointers to their new
- * location. The records don't change location because
- * they are kept butted up against the btree block header.
- */
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
- new_max = cur_max + rec_diff;
- new_size = xfs_bmap_broot_space_calc(mp, new_max);
- ifp->if_broot = krealloc(ifp->if_broot, new_size,
- GFP_KERNEL | __GFP_NOFAIL);
- op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- (int)new_size);
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
- xfs_inode_fork_size(ip, whichfork));
- memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
- return;
+ /* New size is zero, free it. */
+ if (new_size == 0) {
+ ifp->if_broot_bytes = 0;
+ kfree(ifp->if_broot);
+ ifp->if_broot = NULL;
+ return NULL;
}
/*
- * rec_diff is less than 0. In this case, we are shrinking the
- * if_broot buffer. It must already exist. If we go to zero
- * records, just get rid of the root and clear the status bit.
+ * Shrinking the iroot means we allocate a new smaller object and copy
+ * it. We don't trust krealloc not to nop on realloc-down.
*/
- ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
- new_max = cur_max + rec_diff;
- ASSERT(new_max >= 0);
- if (new_max > 0)
- new_size = xfs_bmap_broot_space_calc(mp, new_max);
- else
- new_size = 0;
- if (new_size > 0) {
- new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
- /*
- * First copy over the btree block header.
- */
- memcpy(new_broot, ifp->if_broot,
- xfs_bmbt_block_len(ip->i_mount));
- } else {
- new_broot = NULL;
+ if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) {
+ struct xfs_btree_block *old_broot = ifp->if_broot;
+
+ ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ memcpy(ifp->if_broot, old_broot, new_size);
+ kfree(old_broot);
+ return ifp->if_broot;
}
/*
- * Only copy the keys and pointers if there are any.
+ * Growing the iroot means we can krealloc. This may get us the same
+ * object.
*/
- if (new_max > 0) {
- /*
- * First copy the keys.
- */
- op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
- np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
-
- /*
- * Then copy the pointers.
- */
- op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
- (int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
- }
- kfree(ifp->if_broot);
- ifp->if_broot = new_broot;
- ifp->if_broot_bytes = (int)new_size;
- if (ifp->if_broot)
- ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
- xfs_inode_fork_size(ip, whichfork));
- return;
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ return ifp->if_broot;
}
-
/*
* This is called when the amount of space needed for if_data
* is increased or decreased. The change in size is indicated by
@@ -671,6 +613,25 @@ xfs_iflush_fork(
}
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ ASSERT(whichfork == XFS_DATA_FORK);
+
+ if (!(iip->ili_fields & brootflag[whichfork]))
+ break;
+
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ xfs_iflush_rtrmap(ip, dip);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ xfs_iflush_rtrefcount(ip, dip);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ break;
+
default:
ASSERT(0);
break;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 2373d12fd474..69ed0919d60b 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -170,7 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
void xfs_idestroy_fork(struct xfs_ifork *ifp);
void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
-void xfs_iroot_realloc(struct xfs_inode *, int, int);
+struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp,
+ size_t new_size);
+struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp,
+ size_t new_size);
+
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 15dec19b6c32..a472ac2e45d0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -250,6 +250,10 @@ typedef struct xfs_trans_header {
#define XFS_LI_XMD 0x1249 /* mapping exchange done */
#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */
#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */
+#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */
+#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */
+#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */
+#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -271,7 +275,11 @@ typedef struct xfs_trans_header {
{ XFS_LI_XMI, "XFS_LI_XMI" }, \
{ XFS_LI_XMD, "XFS_LI_XMD" }, \
{ XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \
- { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }
+ { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \
+ { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \
+ { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \
+ { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \
+ { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" }
/*
* Inode Log Item Format definitions.
@@ -351,12 +359,6 @@ struct xfs_inode_log_format_32 {
*/
#define XFS_ILOG_IVERSION 0x8000
-#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
- XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
- XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
- XFS_ILOG_AOWNER)
-
#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 5397a8ff004d..66c7916fb5cd 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -79,6 +79,10 @@ extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrud_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcud_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c
index bae7377c0f22..178e89711cb7 100644
--- a/fs/xfs/libxfs/xfs_metadir.c
+++ b/fs/xfs/libxfs/xfs_metadir.c
@@ -29,6 +29,10 @@
#include "xfs_dir2_priv.h"
#include "xfs_parent.h"
#include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
/*
* Metadata Directory Tree
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index adeb25d1a444..2f5f554a36d4 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -17,6 +17,28 @@
#include "xfs_metafile.h"
#include "xfs_trace.h"
#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+static const struct {
+ enum xfs_metafile_type mtype;
+ const char *name;
+} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR };
+
+const char *
+xfs_metafile_type_str(enum xfs_metafile_type metatype)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) {
+ if (xfs_metafile_type_strs[i].mtype == metatype)
+ return xfs_metafile_type_strs[i].name;
+ }
+
+ return NULL;
+}
/* Set up an inode to be recognized as a metadata directory inode. */
void
@@ -50,3 +72,204 @@ xfs_metafile_clear_iflag(
ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+
+/*
+ * Is the amount of space that could be allocated towards a given metadata
+ * file at or beneath a certain threshold?
+ */
+static inline bool
+xfs_metafile_resv_can_cover(
+ struct xfs_inode *ip,
+ int64_t rhs)
+{
+ /*
+ * The amount of space that can be allocated to this metadata file is
+ * the remaining reservation for the particular metadata file + the
+ * global free block count. Take care of the first case to avoid
+ * touching the per-cpu counter.
+ */
+ if (ip->i_delayed_blks >= rhs)
+ return true;
+
+ /*
+ * There aren't enough blocks left in the inode's reservation, but it
+ * isn't critical unless there also isn't enough free space.
+ */
+ return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+ rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks? For now we'll define that
+ * as the number of blocks we can get our hands on being less than 10% of what
+ * we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_metafile_resv_critical(
+ struct xfs_inode *ip)
+{
+ uint64_t asked_low_water;
+
+ if (!ip)
+ return false;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_critical(ip, 0);
+
+ if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+ return true;
+
+ asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+ if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
+ return true;
+
+ return XFS_TEST_ERROR(false, ip->i_mount,
+ XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_metafile_resv_alloc_space(
+ struct xfs_inode *ip,
+ struct xfs_alloc_arg *args)
+{
+ int64_t len = args->len;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(args->resv == XFS_AG_RESV_METAFILE);
+
+ trace_xfs_metafile_resv_alloc_space(ip, args->len);
+
+ /*
+ * Allocate the blocks from the metadata inode's block reservation
+ * and update the ondisk sb counter.
+ */
+ if (ip->i_delayed_blks > 0) {
+ int64_t from_resv;
+
+ from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+ ip->i_delayed_blks -= from_resv;
+ xfs_mod_delalloc(ip, 0, -from_resv);
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+ -from_resv);
+ len -= from_resv;
+ }
+
+ /*
+ * Any allocation in excess of the reservation requires in-core and
+ * on-disk fdblocks updates. If we can grab @len blocks from the
+ * in-core fdblocks then all we need to do is update the on-disk
+ * superblock; if not, then try to steal some from the transaction's
+ * block reservation. Overruns are only expected for rmap btrees.
+ */
+ if (len) {
+ unsigned int field;
+ int error;
+
+ error = xfs_dec_fdblocks(ip->i_mount, len, true);
+ if (error)
+ field = XFS_TRANS_SB_FDBLOCKS;
+ else
+ field = XFS_TRANS_SB_RES_FDBLOCKS;
+
+ xfs_trans_mod_sb(args->tp, field, -len);
+ }
+
+ ip->i_nblocks += args->len;
+ xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_metafile_resv_free_space(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ xfs_filblks_t len)
+{
+ int64_t to_resv;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_free_space(ip, len);
+
+ ip->i_nblocks -= len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ /*
+ * Add the freed blocks back into the inode's delalloc reservation
+ * until it reaches the maximum size. Update the ondisk fdblocks only.
+ */
+ to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ if (to_resv > 0) {
+ to_resv = min_t(int64_t, to_resv, len);
+ ip->i_delayed_blks += to_resv;
+ xfs_mod_delalloc(ip, 0, to_resv);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+ len -= to_resv;
+ }
+
+ /*
+ * Everything else goes back to the filesystem, so update the in-core
+ * and on-disk counters.
+ */
+ if (len)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Release a metadata file's space reservation. */
+void
+xfs_metafile_resv_free(
+ struct xfs_inode *ip)
+{
+ /* Non-btree metadata inodes don't need space reservations. */
+ if (!ip || !ip->i_meta_resv_asked)
+ return;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_free(ip, 0);
+
+ if (ip->i_delayed_blks) {
+ xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
+ xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
+ ip->i_delayed_blks = 0;
+ }
+ ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation. */
+int
+xfs_metafile_resv_init(
+ struct xfs_inode *ip,
+ xfs_filblks_t ask)
+{
+ xfs_filblks_t hidden_space;
+ xfs_filblks_t used;
+ int error;
+
+ if (!ip || ip->i_meta_resv_asked > 0)
+ return 0;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ /*
+ * Space taken by all other metadata btrees are accounted on-disk as
+ * used space. We therefore only hide the space that is reserved but
+ * not used by the trees.
+ */
+ used = ip->i_nblocks;
+ if (used > ask)
+ ask = used;
+ hidden_space = ask - used;
+
+ error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
+ if (error) {
+ trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
+ return error;
+ }
+
+ xfs_mod_delalloc(ip, 0, hidden_space);
+ ip->i_delayed_blks = hidden_space;
+ ip->i_meta_resv_asked = ask;
+
+ trace_xfs_metafile_resv_init(ip, ask);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
index acec400123db..95af4b52e5a7 100644
--- a/fs/xfs/libxfs/xfs_metafile.h
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -6,6 +6,8 @@
#ifndef __XFS_METAFILE_H__
#define __XFS_METAFILE_H__
+const char *xfs_metafile_type_str(enum xfs_metafile_type metatype);
+
/* All metadata files must have these flags set. */
#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \
XFS_DIFLAG_SYNC | \
@@ -21,6 +23,17 @@ void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip,
enum xfs_metafile_type metafile_type);
void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_metafile_resv_critical(struct xfs_inode *ip);
+void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
+ struct xfs_alloc_arg *args);
+void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
+ xfs_filblks_t len);
+void xfs_metafile_resv_free(struct xfs_inode *ip);
+int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
+
/* Code specific to kernel/userspace; must be provided externally. */
int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino,
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index ad0dedf00f18..a85ecddaa48e 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -83,6 +83,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48);
+ XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4);
/*
* m68k has problems with struct xfs_attr_leaf_name_remote, but we pad
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 2dbab68b4fe6..cebe83f7842a 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -25,6 +25,9 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_refcount_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -128,7 +131,7 @@ xfs_refcount_check_irec(
struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
return __this_address;
if (!xfs_refcount_check_domain(irec))
@@ -138,12 +141,43 @@ xfs_refcount_check_irec(
if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
return __this_address;
- if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
return __this_address;
return NULL;
}
+xfs_failaddr_t
+xfs_rtrefcount_check_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
+ return __this_address;
+
+ if (!xfs_refcount_check_domain(irec))
+ return __this_address;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount))
+ return __this_address;
+
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
+ return __this_address;
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_refcount_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec)
+{
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec);
+ return xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
+}
+
static inline int
xfs_refcount_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -152,9 +186,15 @@ xfs_refcount_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
+ if (xfs_btree_is_rtrefcount(cur->bc_ops)) {
+ xfs_warn(mp,
+ "RT Refcount BTree record corruption in rtgroup %u detected at %pS!",
+ cur->bc_group->xg_gno, fa);
+ } else {
+ xfs_warn(mp,
"Refcount BTree record corruption in AG %d detected at %pS!",
cur->bc_group->xg_gno, fa);
+ }
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -180,7 +220,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
+ fa = xfs_refcount_check_btrec(cur, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -853,9 +893,9 @@ xfs_refc_merge_refcount(
const struct xfs_refcount_irec *irec,
enum xfs_refc_adjust_op adjust)
{
- /* Once a record hits MAXREFCOUNT, it is pinned there forever */
- if (irec->rc_refcount == MAXREFCOUNT)
- return MAXREFCOUNT;
+ /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */
+ if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX)
+ return XFS_REFC_REFCOUNT_MAX;
return irec->rc_refcount + adjust;
}
@@ -898,7 +938,7 @@ xfs_refc_want_merge_center(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount + right->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
*ulenp = ulen;
@@ -933,7 +973,7 @@ xfs_refc_want_merge_left(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -967,7 +1007,7 @@ xfs_refc_want_merge_right(
* hence we need to catch u32 addition overflows here.
*/
ulen += cright->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -1065,7 +1105,7 @@ xfs_refcount_still_have_space(
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
cur->bc_refc.shape_changes);
- overhead += cur->bc_mp->m_refc_maxlevels;
+ overhead += cur->bc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -1085,6 +1125,22 @@ xfs_refcount_still_have_space(
cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
+/* Schedule an extent free. */
+static int
+xrefc_free_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *rec)
+{
+ unsigned int flags = 0;
+
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ flags |= XFS_FREE_EXTENT_REALTIME;
+
+ return xfs_free_extent_later(cur->bc_tp,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock),
+ rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags);
+}
+
/*
* Adjust the refcounts of middle extents. At this point we should have
* split extents that crossed the adjustment range; merged with adjacent
@@ -1101,7 +1157,6 @@ xfs_refcount_adjust_extents(
struct xfs_refcount_irec ext, tmp;
int error;
int found_rec, found_tmp;
- xfs_fsblock_t fsbno;
/* Merging did all the work already. */
if (*aglen == 0)
@@ -1117,7 +1172,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
@@ -1154,11 +1209,7 @@ xfs_refcount_adjust_extents(
goto out_error;
}
} else {
- fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
- tmp.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &tmp);
if (error)
goto out_error;
}
@@ -1196,7 +1247,7 @@ xfs_refcount_adjust_extents(
* Adjust the reference count and either update the tree
* (incr) or free the blocks (decr).
*/
- if (ext.rc_refcount == MAXREFCOUNT)
+ if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX)
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur, &ext);
@@ -1216,11 +1267,7 @@ xfs_refcount_adjust_extents(
}
goto advloop;
} else {
- fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
- ext.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &ext);
if (error)
goto out_error;
}
@@ -1417,12 +1464,122 @@ xfs_refcount_finish_one(
}
/*
+ * Set up a continuation a deferred rtrefcount operation by updating the
+ * intent. Checks to make sure we're not going to run off the end of the
+ * rtgroup.
+ */
+static inline int
+xfs_rtrefcount_continue_op(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_intent *ri,
+ xfs_agblock_t new_agbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno,
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno);
+
+ ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount));
+ return 0;
+}
+
+/*
+ * Process one of the deferred realtime refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ */
+int
+xfs_rtrefcount_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+ struct xfs_btree_cur *rcur = *pcur;
+ int error = 0;
+ xfs_rgblock_t bno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+
+ bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock);
+
+ trace_xfs_refcount_deferred(mp, ri);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
+ xfs_btree_del_cursor(rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT);
+ *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg);
+
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
+ }
+
+ switch (ri->ri_type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ return error;
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ return error;
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (!error && ri->ri_blockcount > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, ri);
+ return error;
+}
+
+/*
* Record a refcount intent for later processing.
*/
static void
__xfs_refcount_add(
struct xfs_trans *tp,
enum xfs_refcount_intent_type type,
+ bool isrt,
xfs_fsblock_t startblock,
xfs_extlen_t blockcount)
{
@@ -1434,6 +1591,7 @@ __xfs_refcount_add(
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
+ ri->ri_realtime = isrt;
xfs_refcount_defer_add(tp, ri);
}
@@ -1444,12 +1602,13 @@ __xfs_refcount_add(
void
xfs_refcount_increase_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1459,12 +1618,13 @@ xfs_refcount_increase_extent(
void
xfs_refcount_decrease_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1666,7 +1826,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
}
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_COW;
@@ -1820,6 +1980,7 @@ __xfs_refcount_cow_free(
void
xfs_refcount_alloc_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1828,17 +1989,17 @@ xfs_refcount_alloc_cow_extent(
if (!xfs_has_reflink(mp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len);
/* Add rmap entry */
- xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+ xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
void
xfs_refcount_free_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1848,9 +2009,8 @@ xfs_refcount_free_cow_extent(
return;
/* Remove rmap entry */
- xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
- __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+ xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len);
}
struct xfs_refcount_recovery {
@@ -1879,8 +2039,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) !=
- NULL ||
+ if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
xfs_btree_mark_sick(cur);
@@ -1895,12 +2054,13 @@ xfs_refcount_recover_extent(
/* Find and remove leftover CoW reservations. */
int
xfs_refcount_recover_cow_leftovers(
- struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_group *xg)
{
+ struct xfs_mount *mp = xg->xg_mount;
+ bool isrt = xg->xg_type == XG_TYPE_RTG;
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
- struct xfs_buf *agbp;
+ struct xfs_buf *agbp = NULL;
struct xfs_refcount_recovery *rr, *n;
struct list_head debris;
union xfs_btree_irec low = {
@@ -1913,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers(
xfs_fsblock_t fsb;
int error;
- /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ /* reflink filesystems must not have groups larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG);
BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
- if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
- return -EOPNOTSUPP;
+
+ if (isrt) {
+ if (!xfs_has_rtgroups(mp))
+ return 0;
+ if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS)
+ return -EOPNOTSUPP;
+ } else {
+ if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS)
+ return -EOPNOTSUPP;
+ }
INIT_LIST_HEAD(&debris);
@@ -1934,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
- if (error)
- goto out_trans;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ if (isrt) {
+ xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
+ cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg));
+ } else {
+ error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg));
+ }
/* Find all the leftover CoW staging extents. */
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
xfs_btree_del_cursor(cur, error);
- xfs_trans_brelse(tp, agbp);
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+ else
+ xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
xfs_trans_cancel(tp);
if (error)
goto out_free;
@@ -1956,14 +2133,15 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
/* Free the orphan record */
- fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock);
- xfs_refcount_free_cow_extent(tp, fsb,
+ fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock);
+ xfs_refcount_free_cow_extent(tp, isrt, fsb,
rr->rr_rrec.rc_blockcount);
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ XFS_AG_RESV_NONE,
+ isrt ? XFS_FREE_EXTENT_REALTIME : 0);
if (error)
goto out_trans;
@@ -2028,7 +2206,7 @@ xfs_refcount_query_range_helper(
xfs_failaddr_t fa;
xfs_refcount_btrec_to_irec(rec, &irec);
- fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec);
+ fa = xfs_refcount_check_btrec(cur, &irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 62d78afcf1f3..f2e299a716a4 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -12,6 +12,7 @@ struct xfs_perag;
struct xfs_btree_cur;
struct xfs_bmbt_irec;
struct xfs_refcount_irec;
+struct xfs_rtgroup;
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
@@ -60,6 +61,7 @@ struct xfs_refcount_intent {
enum xfs_refcount_intent_type ri_type;
xfs_extlen_t ri_blockcount;
xfs_fsblock_t ri_startblock;
+ bool ri_realtime;
};
/* Check that the refcount is appropriate for the record domain. */
@@ -74,24 +76,25 @@ xfs_refcount_check_domain(
return true;
}
-void xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+int xfs_refcount_finish_one(struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
+int xfs_rtrefcount_finish_one(struct xfs_trans *tp,
struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
xfs_extlen_t *flen, bool find_end_of_shared);
-void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
- struct xfs_perag *pag);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg);
/*
* While we're adjusting the refcounts records of an extent, we have
@@ -120,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index d0df68dc3131..3cdf50563fec 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -25,6 +25,8 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rmap_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -264,11 +266,77 @@ xfs_rmap_check_irec(
return NULL;
}
+static xfs_failaddr_t
+xfs_rtrmap_check_meta_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (irec->rm_offset != 0)
+ return __this_address;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ return __this_address;
+
+ switch (irec->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ if (irec->rm_startblock != 0)
+ return __this_address;
+ if (irec->rm_blockcount != mp->m_sb.sb_rextsize)
+ return __this_address;
+ return NULL;
+ case XFS_RMAP_OWN_COW:
+ if (!xfs_has_rtreflink(mp))
+ return __this_address;
+ if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+ irec->rm_blockcount))
+ return __this_address;
+ return NULL;
+ default:
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_rtrmap_check_inode_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (!xfs_verify_ino(mp, irec->rm_owner))
+ return __this_address;
+ if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount))
+ return __this_address;
+ if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+ return __this_address;
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_rtrmap_check_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ if (irec->rm_blockcount == 0)
+ return __this_address;
+ if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK))
+ return __this_address;
+ if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner))
+ return xfs_rtrmap_check_meta_irec(rtg, irec);
+ return xfs_rtrmap_check_inode_irec(rtg, irec);
+}
+
static inline xfs_failaddr_t
xfs_rmap_check_btrec(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *irec)
{
+ if (xfs_btree_is_rtrmap(cur->bc_ops) ||
+ xfs_btree_is_mem_rtrmap(cur->bc_ops))
+ return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec);
return xfs_rmap_check_irec(to_perag(cur->bc_group), irec);
}
@@ -283,6 +351,10 @@ xfs_rmap_complain_bad_rec(
if (xfs_btree_is_mem_rmap(cur->bc_ops))
xfs_warn(mp,
"In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+ else if (xfs_btree_is_rtrmap(cur->bc_ops))
+ xfs_warn(mp,
+ "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!",
+ cur->bc_group->xg_gno, fa);
else
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected at %pS!",
@@ -525,7 +597,7 @@ xfs_rmap_free_check_owner(
struct xfs_btree_cur *cur,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
- xfs_filblks_t len,
+ xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags)
@@ -2556,6 +2628,47 @@ __xfs_rmap_finish_intent(
}
}
+static int
+xfs_rmap_finish_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rmap_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_perag *pag = to_perag(ri->ri_group);
+ struct xfs_buf *agbp = NULL;
+ int error;
+
+ /*
+ * Refresh the freelist before we start changing the rmapbt, because a
+ * shape change could cause us to allocate blocks.
+ */
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
+ return error;
+ }
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
+ return -EFSCORRUPTED;
+ }
+ *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag);
+ return 0;
+}
+
+static int
+xfs_rtrmap_finish_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rmap_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ *pcur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ return 0;
+}
+
/*
* Process one of the deferred rmap operations. We pass back the
* btree cursor to maintain our lock on the rmapbt between calls.
@@ -2571,8 +2684,6 @@ xfs_rmap_finish_one(
{
struct xfs_owner_info oinfo;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *rcur = *pcur;
- struct xfs_buf *agbp = NULL;
xfs_agblock_t bno;
bool unwritten;
int error = 0;
@@ -2586,38 +2697,26 @@ xfs_rmap_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- if (rcur != NULL && rcur->bc_group != ri->ri_group) {
- xfs_btree_del_cursor(rcur, 0);
- rcur = NULL;
+ if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) {
+ xfs_btree_del_cursor(*pcur, 0);
*pcur = NULL;
}
- if (rcur == NULL) {
- struct xfs_perag *pag = to_perag(ri->ri_group);
-
- /*
- * Refresh the freelist before we start changing the
- * rmapbt, because a shape change could cause us to
- * allocate blocks.
- */
- error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
- if (error) {
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
+ if (*pcur == NULL) {
+ if (ri->ri_group->xg_type == XG_TYPE_RTG)
+ error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur);
+ else
+ error = xfs_rmap_finish_init_cursor(tp, ri, pcur);
+ if (error)
return error;
- }
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
- return -EFSCORRUPTED;
- }
-
- *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
}
xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
ri->ri_bmap.br_startoff);
unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+ bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock,
+ ri->ri_group->xg_type);
+ error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno,
ri->ri_bmap.br_blockcount, &oinfo, unwritten);
if (error)
return error;
@@ -2647,6 +2746,7 @@ __xfs_rmap_add(
struct xfs_trans *tp,
enum xfs_rmap_intent_type type,
uint64_t owner,
+ bool isrt,
int whichfork,
struct xfs_bmbt_irec *bmap)
{
@@ -2658,6 +2758,7 @@ __xfs_rmap_add(
ri->ri_owner = owner;
ri->ri_whichfork = whichfork;
ri->ri_bmap = *bmap;
+ ri->ri_realtime = isrt;
xfs_rmap_defer_add(tp, ri);
}
@@ -2671,6 +2772,7 @@ xfs_rmap_map_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
@@ -2678,7 +2780,7 @@ xfs_rmap_map_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_MAP_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2690,6 +2792,7 @@ xfs_rmap_unmap_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
@@ -2697,7 +2800,7 @@ xfs_rmap_unmap_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_UNMAP_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/*
@@ -2715,6 +2818,7 @@ xfs_rmap_convert_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
@@ -2722,15 +2826,15 @@ xfs_rmap_convert_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_CONVERT_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
void
xfs_rmap_alloc_extent(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
+ bool isrt,
+ xfs_fsblock_t fsbno,
xfs_extlen_t len,
uint64_t owner)
{
@@ -2739,20 +2843,20 @@ xfs_rmap_alloc_extent(
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
return;
- bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_startblock = fsbno;
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap);
}
/* Schedule the deletion of an rmap for non-file data. */
void
xfs_rmap_free_extent(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
+ bool isrt,
+ xfs_fsblock_t fsbno,
xfs_extlen_t len,
uint64_t owner)
{
@@ -2761,12 +2865,12 @@ xfs_rmap_free_extent(
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
return;
- bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_startblock = fsbno;
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap);
}
/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 96b4321d8310..5f39f6e53cd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,6 +7,7 @@
#define __XFS_RMAP_H__
struct xfs_perag;
+struct xfs_rtgroup;
static inline void
xfs_rmap_ino_bmbt_owner(
@@ -174,6 +175,7 @@ struct xfs_rmap_intent {
uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
struct xfs_group *ri_group;
+ bool ri_realtime;
};
/* functions for updating the rmapbt based on bmbt map/unmap operations */
@@ -184,10 +186,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
+ xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
+ xfs_extlen_t len, uint64_t owner);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
@@ -206,6 +208,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
const struct xfs_rmap_irec *irec);
+xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec);
int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, enum xbtree_recpacking *outcome);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 4ddfb7e395b3..770adf60dd73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1055,7 +1055,7 @@ xfs_rtfree_extent(
xfs_rtxlen_t len) /* length of extent freed */
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
struct xfs_rtalloc_args args = {
.mp = mp,
.tp = tp,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 16563a44bd13..22e5d9cd95f4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -135,6 +135,15 @@ xfs_rtb_to_rtx(
return div_u64(rtbno, mp->m_sb.sb_rextsize);
}
+/* Return the offset of a rtgroup block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rgbno_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rgblock_t rgbno)
+{
+ return rgbno % mp->m_sb.sb_rextsize;
+}
+
/* Return the offset of an rt block number within an rt extent. */
static inline xfs_extlen_t
xfs_rtb_to_rtxoff(
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 4f3bfc884aff..d84d32f1b48f 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -33,6 +33,8 @@
#include "xfs_rtbitmap.h"
#include "xfs_metafile.h"
#include "xfs_metadir.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/* Find the first usable fsblock in this rtgroup. */
static inline uint32_t
@@ -197,11 +199,17 @@ xfs_rtgroup_lock(
* Lock both realtime free space metadata inodes for a freespace
* update.
*/
- xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL);
- xfs_ilock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED);
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
}
/* Unlock metadata inodes associated with this rt group. */
@@ -214,11 +222,17 @@ xfs_rtgroup_unlock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL);
- xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
}
@@ -236,11 +250,15 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP],
- XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_SUMMARY],
- XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL);
}
/* Retrieve rt group geometry. */
@@ -284,7 +302,8 @@ xfs_rtginode_ilock_print_fn(
const struct xfs_inode *ip =
container_of(m, struct xfs_inode, i_lock.dep_map);
- printk(KERN_CONT " rgno=%u", ip->i_projid);
+ printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid,
+ xfs_metafile_type_str(ip->i_metatype));
}
/*
@@ -316,8 +335,10 @@ struct xfs_rtginode_ops {
unsigned int sick; /* rtgroup sickness flag */
+ unsigned int fmt_mask; /* all valid data fork formats */
+
/* Does the fs have this feature? */
- bool (*enabled)(struct xfs_mount *mp);
+ bool (*enabled)(const struct xfs_mount *mp);
/* Create this rtgroup metadata inode and initialize it. */
int (*create)(struct xfs_rtgroup *rtg,
@@ -331,14 +352,40 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.name = "bitmap",
.metafile_type = XFS_METAFILE_RTBITMAP,
.sick = XFS_SICK_RG_BITMAP,
+ .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
+ (1U << XFS_DINODE_FMT_BTREE),
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
.name = "summary",
.metafile_type = XFS_METAFILE_RTSUMMARY,
.sick = XFS_SICK_RG_SUMMARY,
+ .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
+ (1U << XFS_DINODE_FMT_BTREE),
.create = xfs_rtsummary_create,
},
+ [XFS_RTGI_RMAP] = {
+ .name = "rmap",
+ .metafile_type = XFS_METAFILE_RTRMAP,
+ .sick = XFS_SICK_RG_RMAPBT,
+ .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE,
+ /*
+ * growfs must create the rtrmap inodes before adding a
+ * realtime volume to the filesystem, so we cannot use the
+ * rtrmapbt predicate here.
+ */
+ .enabled = xfs_has_rmapbt,
+ .create = xfs_rtrmapbt_create,
+ },
+ [XFS_RTGI_REFCOUNT] = {
+ .name = "refcount",
+ .metafile_type = XFS_METAFILE_RTREFCOUNT,
+ .sick = XFS_SICK_RG_REFCNTBT,
+ .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE,
+ /* same comment about growfs and rmap inodes applies here */
+ .enabled = xfs_has_reflink,
+ .create = xfs_rtrefcountbt_create,
+ },
};
/* Return the shortname of this rtgroup inode. */
@@ -435,8 +482,7 @@ xfs_rtginode_load(
return error;
}
- if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) {
+ if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) {
xfs_irele(ip);
xfs_rtginode_mark_sick(rtg, type);
return -EFSCORRUPTED;
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 2d7822644eff..03f39d4e43fc 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -14,6 +14,8 @@ struct xfs_trans;
enum xfs_rtg_inodes {
XFS_RTGI_BITMAP, /* allocation bitmap */
XFS_RTGI_SUMMARY, /* allocation summary */
+ XFS_RTGI_RMAP, /* rmap btree inode */
+ XFS_RTGI_REFCOUNT, /* refcount btree inode */
XFS_RTGI_MAX,
};
@@ -64,6 +66,26 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
+static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_BITMAP];
+}
+
+static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+}
+
+static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_RMAP];
+}
+
+static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_REFCOUNT];
+}
+
/* Passive rtgroup references */
static inline struct xfs_rtgroup *
xfs_rtgroup_get(
@@ -122,6 +144,32 @@ xfs_rtgroup_next(
return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1);
}
+static inline bool
+xfs_verify_rgbno(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno)
+{
+ ASSERT(xfs_has_rtgroups(rtg_mount(rtg)));
+
+ return xfs_verify_gbno(rtg_group(rtg), rgbno);
+}
+
+/*
+ * Check that [@rgbno,@len] is a valid extent range in @rtg.
+ *
+ * Must only be used for RTG-enabled file systems.
+ */
+static inline bool
+xfs_verify_rgbext(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno,
+ xfs_extlen_t len)
+{
+ ASSERT(xfs_has_rtgroups(rtg_mount(rtg)));
+
+ return xfs_verify_gbext(rtg_group(rtg), rgbno, len);
+}
+
static inline xfs_rtblock_t
xfs_rgbno_to_rtb(
struct xfs_rtgroup *rtg,
@@ -223,9 +271,15 @@ int xfs_update_last_rtgroup_size(struct xfs_mount *mp,
#define XFS_RTGLOCK_BITMAP (1U << 0)
/* Lock the rt bitmap inode in shared mode */
#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1)
+/* Lock the rt rmap inode in exclusive mode */
+#define XFS_RTGLOCK_RMAP (1U << 2)
+/* Lock the rt refcount inode in exclusive mode */
+#define XFS_RTGLOCK_REFCOUNT (1U << 3)
#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \
- XFS_RTGLOCK_BITMAP_SHARED)
+ XFS_RTGLOCK_BITMAP_SHARED | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
@@ -248,6 +302,8 @@ int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type,
bool init);
void xfs_rtginode_irele(struct xfs_inode **ipp);
+void xfs_rtginode_irele(struct xfs_inode **ipp);
+
static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno,
enum xfs_rtg_inodes type)
{
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
new file mode 100644
index 000000000000..3db5e7a4a945
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_health.h"
+
+static struct kmem_cache *xfs_rtrefcountbt_cur_cache;
+
+/*
+ * Realtime Reference Count btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device. See the comments in xfs_refcount_btree.c for more information.
+ *
+ * This tree is basically the same as the regular refcount btree except that
+ * it's rooted in an inode.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
+}
+
+STATIC int
+xfs_rtrefcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_rtrefc_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrefcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0);
+ }
+
+ return cur->bc_mp->m_rtrefc_mxr[level != 0];
+}
+
+/*
+ * Calculate number of records in a realtime refcount btree inode root.
+ */
+unsigned int
+xfs_rtrefcountbt_droot_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= sizeof(struct xfs_rtrefcount_root);
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (2 * sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork so that
+ * we can resize the in-memory buffer to match it. After a resize to the
+ * maximum size this function returns the same value as
+ * xfs_rtrefcountbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrefcountbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_rtrefc_mxr[level != 0];
+ return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_rtrefcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
+ return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+static xfs_failaddr_t
+xfs_rtrefcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ level = be16_to_cpu(block->bb_level);
+ if (level > mp->m_rtrefc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]);
+}
+
+static void
+xfs_rtrefcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_fsblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rtrefcountbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrefcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtrefcountbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_fsblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
+ .name = "xfs_rtrefcountbt",
+ .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) },
+ .verify_read = xfs_rtrefcountbt_read_verify,
+ .verify_write = xfs_rtrefcountbt_write_verify,
+ .verify_struct = xfs_rtrefcountbt_verify,
+};
+
+STATIC int
+xfs_rtrefcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_rtrefcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrefcountbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
+ return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+ be32_to_cpu(key2->refc.rc_startblock));
+}
+
+static inline void
+xfs_rtrefcountbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+static struct xfs_btree_block *
+xfs_rtrefcountbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+ const unsigned int level = cur->bc_nlevels - 1;
+
+ new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just allocate
+ * it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need to
+ * realloc it and possibly move the node block pointers because
+ * those are not butted up against the btree block header.
+ */
+ old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level);
+ broot = xfs_broot_realloc(ifp, new_size);
+ if (level > 0)
+ xfs_rtrefcountbt_move_ptrs(mp, broot, old_size,
+ new_size, old_numrecs);
+ goto out_broot;
+ }
+
+ /*
+ * We're reducing numrecs. If we're going all the way to zero, just
+ * free the block.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0);
+ if (new_size == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ /*
+ * Shrink the btree root by possibly moving the rtrmapbt pointers,
+ * since they are not butted up against the btree block header. Then
+ * reallocate broot.
+ */
+ if (level > 0)
+ xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size,
+ new_size, new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+
+out_broot:
+ ASSERT(xfs_rtrefcount_droot_space(broot) <=
+ xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
+ return broot;
+}
+
+const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
+ .name = "rtrefcount",
+ .type = XFS_BTREE_TYPE_INODE,
+ .geom_flags = XFS_BTGEO_IROOT_RECORDS,
+
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2),
+ .sick_mask = XFS_SICK_RG_REFCNTBT,
+
+ .dup_cursor = xfs_rtrefcountbt_dup_cursor,
+ .alloc_block = xfs_btree_alloc_metafile_block,
+ .free_block = xfs_btree_free_metafile_block,
+ .get_minrecs = xfs_rtrefcountbt_get_minrecs,
+ .get_maxrecs = xfs_rtrefcountbt_get_maxrecs,
+ .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
+ .key_diff = xfs_rtrefcountbt_key_diff,
+ .buf_ops = &xfs_rtrefcountbt_buf_ops,
+ .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .keys_inorder = xfs_rtrefcountbt_keys_inorder,
+ .recs_inorder = xfs_rtrefcountbt_recs_inorder,
+ .keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
+ .broot_realloc = xfs_rtrefcountbt_broot_realloc,
+};
+
+/* Allocate a new rt refcount btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_inode *ip = rtg_refcount(rtg);
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops,
+ mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache);
+
+ cur->bc_ino.ip = ip;
+ cur->bc_refc.nr_ops = 0;
+ cur->bc_refc.shape_changes = 0;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+ cur->bc_ino.whichfork = XFS_DATA_FORK;
+ return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrefcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
+}
+
+/* Calculate number of records in a realtime refcount btree block. */
+static inline unsigned int
+xfs_rtrefcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount btree block.
+ */
+unsigned int
+xfs_rtrefcountbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_RTREFCOUNT_BLOCK_LEN;
+ return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime refcount btrees. */
+unsigned int
+xfs_rtrefcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2;
+
+ /* We need at most one record for every block in an rt group. */
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrefcountbt_init_cur_cache(void)
+{
+ xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur",
+ xfs_btree_cur_sizeof(
+ xfs_rtrefcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rtrefcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rtrefcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rtrefcountbt_cur_cache);
+ xfs_rtrefcountbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of a realtime refcount btree. */
+void
+xfs_rtrefcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int d_maxlevels, r_maxlevels;
+
+ if (!xfs_has_rtreflink(mp)) {
+ mp->m_rtrefc_maxlevels = 0;
+ return;
+ }
+
+ /*
+ * The realtime refcountbt lives on the data device, which means that
+ * its maximum height is constrained by the size of the data device and
+ * the height required to store one refcount record for each rtextent
+ * in an rt group.
+ */
+ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr,
+ mp->m_sb.sb_dblocks);
+ r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr,
+ mp->m_sb.sb_rgextents);
+
+ /* Add one level to handle the inode root level. */
+ mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
+
+/* Calculate the rtrefcount btree size for some records. */
+unsigned long long
+xfs_rtrefcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rtrefc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+static unsigned long long
+xfs_rtrefcountbt_max_size(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rtrefc_mxr[0] == 0)
+ return 0;
+
+ return xfs_rtrefcountbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ * We need enough space to hold one record for every rt extent in the rtgroup.
+ */
+xfs_filblks_t
+xfs_rtrefcountbt_calc_reserves(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents);
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+STATIC void
+xfs_rtrefcountbt_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_rtrefcount_root *dblock,
+ int dblocklen,
+ struct xfs_btree_block *rblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_refcount_key *fkp;
+ __be64 *fpp;
+ struct xfs_refcount_key *tkp;
+ __be64 *tpp;
+ struct xfs_refcount_rec *frp;
+ struct xfs_refcount_rec *trp;
+ unsigned int numrecs;
+ unsigned int maxrecs;
+ unsigned int rblocklen;
+
+ rblocklen = xfs_rtrefcount_broot_space(mp, dblock);
+
+ xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0,
+ ip->i_ino);
+
+ rblock->bb_level = dblock->bb_level;
+ rblock->bb_numrecs = dblock->bb_numrecs;
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+ tkp = xfs_rtrefcount_key_addr(rblock, 1);
+ fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+ tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+ trp = xfs_rtrefcount_rec_addr(rblock, 1);
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Load a realtime reference count btree root in from disk. */
+int
+xfs_iformat_rtrefcount(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ struct xfs_btree_block *broot;
+ unsigned int numrecs;
+ unsigned int level;
+ int dsize;
+
+ /*
+ * growfs must create the rtrefcount inodes before adding a realtime
+ * volume to the filesystem, so we cannot use the rtrefcount predicate
+ * here.
+ */
+ if (!xfs_has_reflink(ip->i_mount)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+ numrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > mp->m_rtrefc_maxlevels ||
+ xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
+ xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
+ if (broot)
+ xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot);
+ return 0;
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_rtrefcountbt_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ int rblocklen,
+ struct xfs_rtrefcount_root *dblock,
+ int dblocklen)
+{
+ struct xfs_refcount_key *fkp;
+ __be64 *fpp;
+ struct xfs_refcount_key *tkp;
+ __be64 *tpp;
+ struct xfs_refcount_rec *frp;
+ struct xfs_refcount_rec *trp;
+ unsigned int maxrecs;
+ unsigned int numrecs;
+
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrefcount_key_addr(rblock, 1);
+ tkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+ fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrefcount_rec_addr(rblock, 1);
+ trp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Flush a realtime reference count btree root out to disk. */
+void
+xfs_iflush_rtrefcount(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(ifp->if_broot_bytes > 0);
+ ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, XFS_DATA_FORK));
+ xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot,
+ ifp->if_broot_bytes, dfp,
+ XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
+
+/*
+ * Create a realtime refcount btree inode.
+ */
+int
+xfs_rtrefcountbt_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *broot;
+
+ ifp->if_format = XFS_DINODE_FMT_META_BTREE;
+ ASSERT(ifp->if_broot_bytes == 0);
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Initialize the empty incore btree root. */
+ broot = xfs_broot_realloc(ifp,
+ xfs_rtrefcount_broot_space_calc(mp, 0, 0));
+ if (broot)
+ xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0,
+ ip->i_ino);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
new file mode 100644
index 000000000000..a99b7a8aec86
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTREFCOUNT_BTREE_H__
+#define __XFS_RTREFCOUNT_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* refcounts only exist on crc enabled filesystems */
+#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg);
+struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake);
+void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp);
+unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
+ unsigned int blocklen, bool leaf);
+void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_rec_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_rec *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_key *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_ptr_addr(
+ struct xfs_btree_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrefcount_ptr_t *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ maxrecs * sizeof(struct xfs_refcount_key) +
+ (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
+int __init xfs_rtrefcountbt_init_cur_cache(void);
+void xfs_rtrefcountbt_destroy_cur_cache(void);
+
+xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
+unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */
+
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_droot_rec_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_rec *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_droot_key_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_key *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_droot_ptr_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrefcount_ptr_t *)
+ ((char *)(block + 1) +
+ maxrecs * sizeof(struct xfs_refcount_key) +
+ (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_broot_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *bb,
+ unsigned int index,
+ unsigned int block_size)
+{
+ return xfs_rtrefcount_ptr_addr(bb, index,
+ xfs_rtrefcountbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space_calc(
+ struct xfs_mount *mp,
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = XFS_RTREFCOUNT_BLOCK_LEN;
+
+ if (level > 0)
+ return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb)
+{
+ return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrefcount_droot_space_calc(
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = sizeof(struct xfs_rtrefcount_root);
+
+ if (level > 0)
+ return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrefcount_droot_space(struct xfs_btree_block *bb)
+{
+ return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
+ struct xfs_btree_block *rblock, int rblocklen,
+ struct xfs_rtrefcount_root *dblock, int dblocklen);
+void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+
+int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+
+#endif /* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
new file mode 100644
index 000000000000..e4ec36943cb7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -0,0 +1,1035 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_metafile.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+
+static struct kmem_cache *xfs_rtrmapbt_cur_cache;
+
+/*
+ * Realtime Reverse Map btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device. See the comments in xfs_rmap_btree.c for more information.
+ *
+ * This tree is basically the same as the regular rmap btree except that it
+ * is rooted in an inode and does not live in free space.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
+}
+
+STATIC int
+xfs_rtrmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_rtrmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0);
+ }
+
+ return cur->bc_mp->m_rtrmap_mxr[level != 0];
+}
+
+/* Calculate number of records in the ondisk realtime rmap btree inode root. */
+unsigned int
+xfs_rtrmapbt_droot_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= sizeof(struct xfs_rtrmap_root);
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen / (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_rtrmapbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrmapbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_rtrmap_mxr[level != 0];
+ return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+ return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
+STATIC void
+xfs_rtrmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+}
+
+STATIC void
+xfs_rtrmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rtrmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rtrmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+ return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
+
+ d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = offset_keymask(be64_to_cpu(kp->rm_offset));
+ y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
+
+ /* Doesn't make sense to mask off the physical space part */
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
+ d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ if (!mask || mask->rmap.rm_owner) {
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
+
+ if (!mask || mask->rmap.rm_offset) {
+ /* Doesn't make sense to allow offset but not owner */
+ ASSERT(!mask || mask->rmap.rm_owner);
+
+ x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+ y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
+
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_rtrmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ level = be16_to_cpu(block->bb_level);
+ if (level > mp->m_rtrmap_maxlevels)
+ return __this_address;
+
+ return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
+}
+
+static void
+xfs_rtrmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_fsblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rtrmapbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtrmapbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_fsblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
+ .name = "xfs_rtrmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+ .verify_read = xfs_rtrmapbt_read_verify,
+ .verify_write = xfs_rtrmapbt_write_verify,
+ .verify_struct = xfs_rtrmapbt_verify,
+};
+
+STATIC int
+xfs_rtrmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rtrmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrmapbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
+ /*
+ * We only support checking contiguity of the physical space component.
+ * If any callers ever need more specificity than that, they'll have to
+ * implement it here.
+ */
+ ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+ return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+ be32_to_cpu(key2->rmap.rm_startblock));
+}
+
+static inline void
+xfs_rtrmapbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t));
+}
+
+static struct xfs_btree_block *
+xfs_rtrmapbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+ const unsigned int level = cur->bc_nlevels - 1;
+
+ new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just allocate
+ * it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need to
+ * realloc it and possibly move the node block pointers because
+ * those are not butted up against the btree block header.
+ */
+ old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0);
+ broot = xfs_broot_realloc(ifp, new_size);
+ if (level > 0)
+ xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size,
+ old_numrecs);
+ goto out_broot;
+ }
+
+ /*
+ * We're reducing numrecs. If we're going all the way to zero, just
+ * free the block.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0);
+ if (new_size == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ /*
+ * Shrink the btree root by possibly moving the rtrmapbt pointers,
+ * since they are not butted up against the btree block header. Then
+ * reallocate broot.
+ */
+ if (level > 0)
+ xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size,
+ new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+
+out_broot:
+ ASSERT(xfs_rtrmap_droot_space(broot) <=
+ xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
+ return broot;
+}
+
+const struct xfs_btree_ops xfs_rtrmapbt_ops = {
+ .name = "rtrmap",
+ .type = XFS_BTREE_TYPE_INODE,
+ .geom_flags = XFS_BTGEO_OVERLAPPING |
+ XFS_BTGEO_IROOT_RECORDS,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2),
+ .sick_mask = XFS_SICK_RG_RMAPBT,
+
+ .dup_cursor = xfs_rtrmapbt_dup_cursor,
+ .alloc_block = xfs_btree_alloc_metafile_block,
+ .free_block = xfs_btree_free_metafile_block,
+ .get_minrecs = xfs_rtrmapbt_get_minrecs,
+ .get_maxrecs = xfs_rtrmapbt_get_maxrecs,
+ .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rtrmapbt_key_diff,
+ .buf_ops = &xfs_rtrmapbt_buf_ops,
+ .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .keys_inorder = xfs_rtrmapbt_keys_inorder,
+ .recs_inorder = xfs_rtrmapbt_recs_inorder,
+ .keys_contiguous = xfs_rtrmapbt_keys_contiguous,
+ .broot_realloc = xfs_rtrmapbt_broot_realloc,
+};
+
+/* Allocate a new rt rmap btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_inode *ip = rtg_rmap(rtg);
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
+ mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
+
+ cur->bc_ino.ip = ip;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ cur->bc_ino.whichfork = XFS_DATA_FORK;
+ cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+
+ return cur;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+/*
+ * Validate an in-memory realtime rmap btree block. Callers are allowed to
+ * generate an in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rtrmapbt_mem_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (xfs_has_rmapbt(mp)) {
+ if (level >= mp->m_rtrmap_maxlevels)
+ return __this_address;
+ } else {
+ if (level >= xfs_rtrmapbt_maxlevels_ondisk())
+ return __this_address;
+ }
+
+ maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rtrmapbt_mem_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
+ .name = "xfs_rtrmapbt_mem",
+ .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+ .verify_read = xfs_rtrmapbt_mem_rw_verify,
+ .verify_write = xfs_rtrmapbt_mem_rw_verify,
+ .verify_struct = xfs_rtrmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
+ .type = XFS_BTREE_TYPE_MEM,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = xfs_rtrmapbt_key_diff,
+ .buf_ops = &xfs_rtrmapbt_mem_buf_ops,
+ .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .keys_inorder = xfs_rtrmapbt_keys_inorder,
+ .recs_inorder = xfs_rtrmapbt_recs_inorder,
+ .keys_contiguous = xfs_rtrmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_mem_cursor(
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbt)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops,
+ mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
+ cur->bc_mem.xfbtree = xfbt;
+ cur->bc_nlevels = xfbt->nlevels;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ return cur;
+}
+
+/* Create an in-memory realtime rmap btree. */
+int
+xfs_rtrmapbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ xfs_rgnumber_t rgno)
+{
+ xfbt->owner = rgno;
+ return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops);
+}
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+/*
+ * Install a new rt reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
+}
+
+/* Calculate number of records in a rt reverse mapping btree block. */
+static inline unsigned int
+xfs_rtrmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rt reverse mapping btree block.
+ */
+unsigned int
+xfs_rtrmapbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_RTRMAP_BLOCK_LEN;
+ return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime reverse mapping btrees. */
+unsigned int
+xfs_rtrmapbt_maxlevels_ondisk(void)
+{
+ unsigned long long max_dblocks;
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
+ *
+ * On a reflink filesystem, each block in an rtgroup can have up to
+ * 2^32 (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records. However, we're
+ * likely to run out of blocks in the data device long before that
+ * happens, which means that we must compute the max height based on
+ * what the btree will look like if it consumes almost all the blocks
+ * in the data device due to maximal sharing factor.
+ */
+ max_dblocks = -1U; /* max ag count */
+ max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
+ return xfs_btree_space_to_height(minrecs, max_dblocks);
+}
+
+int __init
+xfs_rtrmapbt_init_cur_cache(void)
+{
+ xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rtrmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rtrmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
+ xfs_rtrmapbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of an rt reverse mapping btree. */
+void
+xfs_rtrmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int d_maxlevels, r_maxlevels;
+
+ if (!xfs_has_rtrmapbt(mp)) {
+ mp->m_rtrmap_maxlevels = 0;
+ return;
+ }
+
+ /*
+ * The realtime rmapbt lives on the data device, which means that its
+ * maximum height is constrained by the size of the data device and
+ * the height required to store one rmap record for each block in an
+ * rt group.
+ *
+ * On a reflink filesystem, each rt block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. This makes the computation of
+ * maxlevels based on record count meaningless, so we only consider the
+ * size of the data device.
+ */
+ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
+ mp->m_sb.sb_dblocks);
+ if (xfs_has_rtreflink(mp)) {
+ mp->m_rtrmap_maxlevels = d_maxlevels + 1;
+ return;
+ }
+
+ r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
+ mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /* Add one level to handle the inode root level. */
+ mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
+
+/* Calculate the rtrmap btree size for some records. */
+unsigned long long
+xfs_rtrmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+static unsigned long long
+xfs_rtrmapbt_max_size(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rtrmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rtrmapbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+xfs_filblks_t
+xfs_rtrmapbt_calc_reserves(
+ struct xfs_mount *mp)
+{
+ uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+
+ /* Reserve 1% of the rtgroup or enough for 1 block per record. */
+ return max_t(xfs_filblks_t, blocks / 100,
+ xfs_rtrmapbt_max_size(mp, blocks));
+}
+
+/* Convert on-disk form of btree root to in-memory form. */
+STATIC void
+xfs_rtrmapbt_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen,
+ struct xfs_btree_block *rblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rmap_key *fkp;
+ __be64 *fpp;
+ struct xfs_rmap_key *tkp;
+ __be64 *tpp;
+ struct xfs_rmap_rec *frp;
+ struct xfs_rmap_rec *trp;
+ unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock);
+ unsigned int numrecs;
+ unsigned int maxrecs;
+
+ xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
+
+ rblock->bb_level = dblock->bb_level;
+ rblock->bb_numrecs = dblock->bb_numrecs;
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+ tkp = xfs_rtrmap_key_addr(rblock, 1);
+ fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+ tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+ trp = xfs_rtrmap_rec_addr(rblock, 1);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Load a realtime reverse mapping btree root in from disk. */
+int
+xfs_iformat_rtrmap(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ struct xfs_btree_block *broot;
+ unsigned int numrecs;
+ unsigned int level;
+ int dsize;
+
+ /*
+ * growfs must create the rtrmap inodes before adding a realtime volume
+ * to the filesystem, so we cannot use the rtrmapbt predicate here.
+ */
+ if (!xfs_has_rmapbt(ip->i_mount)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+ numrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > mp->m_rtrmap_maxlevels ||
+ xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
+ xfs_rtrmap_broot_space_calc(mp, level, numrecs));
+ if (broot)
+ xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot);
+ return 0;
+}
+
+/* Convert in-memory form of btree root to on-disk form. */
+void
+xfs_rtrmapbt_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ unsigned int rblocklen,
+ struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen)
+{
+ struct xfs_rmap_key *fkp;
+ __be64 *fpp;
+ struct xfs_rmap_key *tkp;
+ __be64 *tpp;
+ struct xfs_rmap_rec *frp;
+ struct xfs_rmap_rec *trp;
+ unsigned int numrecs;
+ unsigned int maxrecs;
+
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrmap_key_addr(rblock, 1);
+ tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+ fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrmap_rec_addr(rblock, 1);
+ trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Flush a realtime reverse mapping btree root out to disk. */
+void
+xfs_iflush_rtrmap(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(ifp->if_broot_bytes > 0);
+ ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, XFS_DATA_FORK));
+ xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
+ dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
+
+/*
+ * Create a realtime rmap btree inode.
+ */
+int
+xfs_rtrmapbt_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *broot;
+
+ ifp->if_format = XFS_DINODE_FMT_META_BTREE;
+ ASSERT(ifp->if_broot_bytes == 0);
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Initialize the empty incore btree root. */
+ broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0));
+ if (broot)
+ xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0,
+ ip->i_ino);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+
+ return 0;
+}
+
+/*
+ * Initialize an rmap for a realtime superblock using the potentially updated
+ * rt geometry in the provided @mp.
+ */
+int
+xfs_rtrmapbt_init_rtsb(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_blockcount = mp->m_sb.sb_rextsize,
+ .rm_owner = XFS_RMAP_OWN_FS,
+ };
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(xfs_has_rtsb(mp));
+ ASSERT(rtg_rgno(rtg) == 0);
+
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_map_raw(cur, &rmap);
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
new file mode 100644
index 000000000000..9d0915089891
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTRMAP_BTREE_H__
+#define __XFS_RTRMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+struct xfbtree;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg);
+struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake);
+void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp);
+unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrmapbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_rec_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_rec *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_high_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ sizeof(struct xfs_rmap_key) +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_ptr_addr(
+ struct xfs_btree_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrmap_ptr_t *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+ (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rtrmapbt_init_cur_cache(void);
+void xfs_rtrmapbt_destroy_cur_cache(void);
+
+xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
+
+/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */
+
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_droot_rec_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_rec *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_droot_key_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)(block + 1) +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_droot_ptr_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrmap_ptr_t *)
+ ((char *)(block + 1) +
+ maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+ (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_broot_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *bb,
+ unsigned int index,
+ unsigned int block_size)
+{
+ return xfs_rtrmap_ptr_addr(bb, index,
+ xfs_rtrmapbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrmap_broot_space_calc(
+ struct xfs_mount *mp,
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = XFS_RTRMAP_BLOCK_LEN;
+
+ if (level > 0)
+ return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb)
+{
+ return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrmap_droot_space_calc(
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = sizeof(struct xfs_rtrmap_root);
+
+ if (level > 0)
+ return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrmap_droot_space(struct xfs_btree_block *bb)
+{
+ return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
+ unsigned int rblocklen, struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen);
+void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+
+int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp);
+
+unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
+
+#endif /* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 3b5623611eba..3dc5f5dba162 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -28,6 +28,8 @@
#include "xfs_rtbitmap.h"
#include "xfs_exchrange.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1215,11 +1217,23 @@ xfs_sb_mount_common(
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+ mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false);
+ mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2;
+ mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2;
+
mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+ mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ true);
+ mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ false);
+ mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2;
+ mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e7efdb9ceaf3..b1e0d9bc1f7d 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -42,6 +42,8 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
@@ -55,6 +57,9 @@ extern const struct xfs_btree_ops xfs_bmbt_ops;
extern const struct xfs_btree_ops xfs_refcountbt_ops;
extern const struct xfs_btree_ops xfs_rmapbt_ops;
extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops;
+extern const struct xfs_btree_ops xfs_rtrefcountbt_ops;
static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
{
@@ -96,10 +101,26 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
{
return ops == &xfs_rmapbt_mem_ops;
}
+
+static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrmapbt_mem_ops;
+}
#else
# define xfs_btree_is_mem_rmap(...) (false)
+# define xfs_btree_is_mem_rtrmap(...) (false)
#endif
+static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrmapbt_ops;
+}
+
+static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrefcountbt_ops;
+}
+
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index bab402340b5d..13d00c7166e1 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -92,6 +92,14 @@ xfs_refcountbt_block_count(
return num_ops * (2 * mp->m_refc_maxlevels - 1);
}
+static unsigned int
+xfs_rtrefcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_rtrefc_maxlevels - 1);
+}
+
/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
@@ -213,7 +221,9 @@ xfs_calc_inode_chunk_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
* blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
- * extents, as well as the realtime summary block.
+ * extents, as well as the realtime summary block (t1). Realtime rmap btree
+ * operations happen in a second transaction, so factor in a couple of rtrmapbt
+ * splits (t2).
*/
static unsigned int
xfs_rtalloc_block_count(
@@ -222,10 +232,16 @@ xfs_rtalloc_block_count(
{
unsigned int rtbmp_blocks;
xfs_rtxlen_t rtxlen;
+ unsigned int t1, t2 = 0;
rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen);
- return (rtbmp_blocks + 1) * num_ops;
+ t1 = (rtbmp_blocks + 1) * num_ops;
+
+ if (xfs_has_rmapbt(mp))
+ t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1);
+
+ return max(t1, t2);
}
/*
@@ -251,10 +267,13 @@ xfs_rtalloc_block_count(
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
*
- * This is calculated as:
+ * This is calculated as the max of:
* Data device refcount updates (t1):
* the agfs of the ags containing the blocks: nr_ops * sector size
* the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ * Realtime refcount updates (t2);
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
static unsigned int
xfs_calc_refcountbt_reservation(
@@ -262,12 +281,20 @@ xfs_calc_refcountbt_reservation(
unsigned int nr_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int t1, t2 = 0;
if (!xfs_has_reflink(mp))
return 0;
- return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+ t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+
+ if (xfs_has_realtime(mp))
+ t2 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ blksz);
+
+ return max(t1, t2);
}
/*
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 1155ff2d37e2..d89b570aafcc 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -14,6 +14,19 @@
#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \
(((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
+/* Worst case number of realtime rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0]))
+
+/* Adding one realtime rmap could split every level to the top of the tree. */
+#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels)
+
+/* Blocks we might need to add "b" realtime rmaps to a tree. */
+#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \
+ ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \
+ XFS_RTRMAPADD_SPACE_RES(mp))
+
/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index bf33c2b1e43e..ca2401c1facd 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -202,6 +202,13 @@ enum xfs_ag_resv_type {
* altering fdblocks. If you think you need this you're wrong.
*/
XFS_AG_RESV_IGNORE,
+
+ /*
+ * This allocation activity is being done on behalf of a metadata file.
+ * These files maintain their own permanent space reservations and are
+ * required to adjust fdblocks using the xfs_metafile_resv_* helpers.
+ */
+ XFS_AG_RESV_METAFILE,
};
/* Results of scanning a btree keyspace to check occupancy. */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index b45d2b32051a..cd6f0223879f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -647,7 +647,7 @@ xrep_agfl_fill(
xfs_agblock_t agbno = start;
int error;
- trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
+ trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len);
while (agbno < start + len && af->fl_off < af->flcount)
af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 0433363a90b6..bed6a09aa791 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -542,8 +542,9 @@ xrep_abt_dispose_one(
/* Add a deferred rmap for each extent we used. */
if (resv->used > 0)
- xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno,
- resv->used, XFS_RMAP_OWN_AG);
+ xfs_rmap_alloc_extent(sc->tp, false,
+ xfs_agbno_to_fsb(pag, resv->agbno), resv->used,
+ XFS_RMAP_OWN_AG);
/*
* For each reserved btree block we didn't use, add it to the free
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 7e00312225ed..66da7d4d56ba 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -21,6 +21,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_rtgroup.h"
#include "xfs_health.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -143,15 +145,22 @@ static inline bool
xchk_bmap_get_rmap(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno,
+ xfs_agblock_t bno,
uint64_t owner,
struct xfs_rmap_irec *rmap)
{
+ struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur;
xfs_fileoff_t offset;
unsigned int rflags = 0;
int has_rmap;
int error;
+ if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork))
+ curp = &info->sc->sr.rmap_cur;
+
+ if (*curp == NULL)
+ return false;
+
if (info->whichfork == XFS_ATTR_FORK)
rflags |= XFS_RMAP_ATTR_FORK;
if (irec->br_state == XFS_EXT_UNWRITTEN)
@@ -172,13 +181,13 @@ xchk_bmap_get_rmap(
* range rmap lookup to make sure we get the correct owner/offset.
*/
if (info->is_shared) {
- error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
- owner, offset, rflags, rmap, &has_rmap);
+ error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset,
+ rflags, rmap, &has_rmap);
} else {
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
- owner, offset, rflags, rmap, &has_rmap);
+ error = xfs_rmap_lookup_le(*curp, bno, owner, offset,
+ rflags, rmap, &has_rmap);
}
- if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, curp))
return false;
if (!has_rmap)
@@ -192,29 +201,29 @@ STATIC void
xchk_bmap_xref_rmap(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno)
+ xfs_agblock_t bno)
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
uint64_t owner = info->sc->ip->i_ino;
- if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+ if (xchk_skip_xref(info->sc->sm))
return;
/* Find the rmap record for this irec. */
- if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
return;
/*
* The rmap must be an exact match for this incore file mapping record,
* which may have arisen from multiple ondisk records.
*/
- if (rmap.rm_startblock != agbno)
+ if (rmap.rm_startblock != bno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (rmap_end != agbno + irec->br_blockcount)
+ if (rmap_end != bno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -259,7 +268,7 @@ STATIC void
xchk_bmap_xref_rmap_cow(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno)
+ xfs_agblock_t bno)
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
@@ -269,7 +278,7 @@ xchk_bmap_xref_rmap_cow(
return;
/* Find the rmap record for this irec. */
- if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
return;
/*
@@ -277,12 +286,12 @@ xchk_bmap_xref_rmap_cow(
* can start before and end after the physical space allocated to this
* mapping. There are no offsets to check.
*/
- if (rmap.rm_startblock > agbno)
+ if (rmap.rm_startblock > bno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (rmap_end < agbno + irec->br_blockcount)
+ if (rmap_end < bno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -315,6 +324,8 @@ xchk_bmap_rt_iextent_xref(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
+ struct xfs_owner_info oinfo;
+ xfs_rgblock_t rgbno;
int error;
error = xchk_rtgroup_init_existing(info->sc,
@@ -324,10 +335,46 @@ xchk_bmap_rt_iextent_xref(
irec->br_startoff, &error))
return;
- xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL);
+ error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ goto out_free;
+
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
+ if (!xfs_has_rtrmapbt(info->sc->mp))
+ goto out_cur;
+
+ rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock);
+
+ switch (info->whichfork) {
+ case XFS_DATA_FORK:
+ xchk_bmap_xref_rmap(info, irec, rgbno);
+ if (!xfs_is_reflink_inode(info->sc->ip)) {
+ xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+ info->whichfork, irec->br_startoff);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &oinfo);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ }
+ xchk_xref_is_not_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ case XFS_COW_FORK:
+ xchk_bmap_xref_rmap_cow(info, irec, rgbno);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &XFS_RMAP_OINFO_COW);
+ xchk_xref_is_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ }
+out_cur:
+ xchk_rtgroup_btcur_free(&info->sc->sr);
+out_free:
xchk_rtgroup_free(info->sc, &info->sc->sr);
}
@@ -614,8 +661,7 @@ xchk_bmap_check_rmap(
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
if (irec.br_startblock !=
- xfs_agbno_to_fsb(to_perag(cur->bc_group),
- check_rec.rm_startblock))
+ xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
if (irec.br_blockcount > check_rec.rm_blockcount)
@@ -669,6 +715,30 @@ xchk_bmap_check_ag_rmaps(
return error;
}
+/* Make sure each rt rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rt_rmaps(
+ struct xfs_scrub *sc,
+ struct xfs_rtgroup *rtg)
+{
+ struct xchk_bmap_check_rmap_info sbcri;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
+
+ sbcri.sc = sc;
+ sbcri.whichfork = XFS_DATA_FORK;
+ error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
+ if (error == -ECANCELED)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ return error;
+}
+
/*
* Decide if we want to scan the reverse mappings to determine if the attr
* fork /really/ has zero space mappings.
@@ -723,10 +793,6 @@ xchk_bmap_check_empty_datafork(
{
struct xfs_ifork *ifp = &ip->i_df;
- /* Don't support realtime rmap checks yet. */
- if (XFS_IS_REALTIME_INODE(ip))
- return false;
-
/*
* If the dinode repair found a bad data fork, it will reset the fork
* to extents format with zero records and wait for the this scrubber
@@ -777,6 +843,21 @@ xchk_bmap_check_rmaps(
struct xfs_perag *pag = NULL;
int error;
+ if (xfs_ifork_is_realtime(sc->ip, whichfork)) {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ error = xchk_bmap_check_rt_rmaps(sc, rtg);
+ if (error ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ return 0;
+ }
+
while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error ||
@@ -983,6 +1064,7 @@ xchk_bmap(
case XFS_DINODE_FMT_UUID:
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_META_BTREE:
/* No mappings to check. */
if (whichfork == XFS_COW_FORK)
xchk_fblock_set_corrupt(sc, whichfork, 0);
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 7c4955482641..1084213b8e9b 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -25,11 +25,13 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_quota.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -99,14 +101,21 @@ xrep_bmap_discover_shared(
xfs_filblks_t blockcount)
{
struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *cur;
xfs_agblock_t agbno;
xfs_agblock_t fbno;
xfs_extlen_t flen;
int error;
- agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
- error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
- &fbno, &flen, false);
+ if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ agbno = xfs_rtb_to_rgbno(sc->mp, startblock);
+ cur = sc->sr.refc_cur;
+ } else {
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ cur = sc->sa.refc_cur;
+ }
+ error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen,
+ false);
if (error)
return error;
@@ -359,6 +368,114 @@ xrep_bmap_scan_ag(
return error;
}
+#ifdef CONFIG_XFS_RT
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_rtfork_rmap(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec)
+{
+ /* xattr extents are never stored on realtime devices */
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ return -EFSCORRUPTED;
+
+ /* bmbt blocks are never stored on realtime devices */
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return -EFSCORRUPTED;
+
+ /* Data extents for non-rt files are never stored on the rt device. */
+ if (!XFS_IS_REALTIME_INODE(sc->ip))
+ return -EFSCORRUPTED;
+
+ /* Check the file offsets and physical extents. */
+ if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Check that this is within the rtgroup. */
+ if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ return xrep_require_rtext_inuse(sc, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Record realtime extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rb->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec);
+ if (error)
+ return error;
+
+ /*
+ * Record all blocks allocated to this file even if the extent isn't
+ * for the fork we're rebuilding so that we can reset di_nblocks later.
+ */
+ rb->nblocks += rec->rm_blockcount;
+
+ /* If this rmap isn't for the fork we want, we're done. */
+ if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+
+ return xrep_bmap_from_rmap(rb, rec->rm_offset,
+ xfs_rgbno_to_rtb(to_rtg(cur->bc_group),
+ rec->rm_startblock),
+ rec->rm_blockcount,
+ rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/* Scan the realtime reverse mappings to build the new extent map. */
+STATIC int
+xrep_bmap_scan_rtgroup(
+ struct xrep_bmap *rb,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ if (!xfs_has_rtrmapbt(sc->mp))
+ return 0;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+ XFS_RTGLOCK_RMAP |
+ XFS_RTGLOCK_REFCOUNT |
+ XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+ return error;
+}
+#else
+static inline int
+xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg)
+{
+ return -EFSCORRUPTED;
+}
+#endif
+
/* Find the delalloc extents from the old incore extent tree. */
STATIC int
xrep_bmap_find_delalloc(
@@ -410,6 +527,22 @@ xrep_bmap_find_mappings(
struct xfs_perag *pag = NULL;
int error = 0;
+ /*
+ * Iterate the rtrmaps for extents. Metadata files never have content
+ * on the realtime device, so there's no need to scan them.
+ */
+ if (!xfs_is_metadir_inode(sc->ip)) {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ error = xrep_bmap_scan_rtgroup(rb, rtg);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+ }
+
/* Iterate the rmaps for extents. */
while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xrep_bmap_scan_ag(rb, pag);
@@ -731,6 +864,7 @@ xrep_bmap_check_inputs(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_META_BTREE:
return -ECANCELED;
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -753,10 +887,6 @@ xrep_bmap_check_inputs(
return -EINVAL;
}
- /* Don't know how to rebuild realtime data forks. */
- if (XFS_IS_REALTIME_INODE(sc->ip))
- return -EOPNOTSUPP;
-
return 0;
}
@@ -782,10 +912,6 @@ xrep_bmap_init_reflink_scan(
if (whichfork != XFS_DATA_FORK)
return RLS_IRRELEVANT;
- /* cannot share realtime extents */
- if (XFS_IS_REALTIME_INODE(sc->ip))
- return RLS_IRRELEVANT;
-
return RLS_UNKNOWN;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 5cbd94b56582..28ad341df8ee 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -35,6 +35,9 @@
#include "xfs_exchmaps.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -719,20 +722,111 @@ xchk_rtgroup_init(
return 0;
}
-void
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+int
xchk_rtgroup_lock(
+ struct xfs_scrub *sc,
struct xchk_rt *sr,
unsigned int rtglock_flags)
{
- xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+ int error = 0;
+
+ ASSERT(sr->rtg != NULL);
+
+ /*
+ * If we're /only/ locking the rtbitmap in shared mode, then we're
+ * obviously not trying to compare records in two metadata inodes.
+ * There's no need to drain intents here because the caller (most
+ * likely the rgsuper scanner) doesn't need that level of consistency.
+ */
+ if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+
+ /*
+ * If we've grabbed a non-metadata file for scrubbing, we
+ * assume that holding its ILOCK will suffice to coordinate
+ * with any rt intent chains involving this inode.
+ */
+ if (sc->ip && !xfs_is_internal_inode(sc->ip))
+ break;
+
+ /*
+ * Decide if the rt group is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the metadata inodes for this rt group;
+ * now take a look to see if there are any intents in progress.
+ * If there are, drop the rt group inode locks and wait for the
+ * intents to drain. Since we hold the rt group inode locks
+ * for the duration of the scrub, this is the only time we have
+ * to sample the intents counter; any threads increasing it
+ * after this point can't possibly be in the middle of a chain
+ * of rt metadata updates.
+ *
+ * Obviously, this should be slanted against scrub and in favor
+ * of runtime threads.
+ */
+ if (!xfs_group_intent_busy(rtg_group(sr->rtg)))
+ break;
+
+ xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+ if (!(sc->flags & XCHK_FSGATES_DRAIN))
+ return -ECHRNG;
+ error = xfs_group_intent_drain(rtg_group(sr->rtg));
+ if (error) {
+ if (error == -ERESTARTSYS)
+ error = -EINTR;
+ return error;
+ }
+ } while (1);
+
sr->rtlock_flags = rtglock_flags;
+
+ if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP))
+ sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+
+ if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
+
+ return 0;
+}
+
+/*
+ * Free all the btree cursors and other incore data relating to the realtime
+ * group. This has to be done /before/ committing (or cancelling) the scrub
+ * transaction.
+ */
+void
+xchk_rtgroup_btcur_free(
+ struct xchk_rt *sr)
+{
+ if (sr->rmap_cur)
+ xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR);
+ if (sr->refc_cur)
+ xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR);
+
+ sr->refc_cur = NULL;
+ sr->rmap_cur = NULL;
}
/*
* Unlock the realtime group. This must be done /after/ committing (or
* cancelling) the scrub transaction.
*/
-static void
+void
xchk_rtgroup_unlock(
struct xchk_rt *sr)
{
@@ -812,6 +906,14 @@ xchk_setup_fs(
return xchk_trans_alloc(sc, resblks);
}
+/* Set us up with a transaction and an empty context to repair rt metadata. */
+int
+xchk_setup_rt(
+ struct xfs_scrub *sc)
+{
+ return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc));
+}
+
/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
@@ -1379,7 +1481,7 @@ xchk_fsgates_enable(
trace_xchk_fsgates_enable(sc, scrub_fsgates);
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_enable();
+ xfs_defer_drain_wait_enable();
if (scrub_fsgates & XCHK_FSGATES_QUOTA)
xfs_dqtrx_hook_enable();
@@ -1573,3 +1675,63 @@ xchk_inode_rootdir_inum(const struct xfs_inode *ip)
return mp->m_metadirip->i_ino;
return mp->m_rootip->i_ino;
}
+
+static int
+xchk_meta_btree_count_blocks(
+ struct xfs_scrub *sc,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!sc->sr.rtg) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ switch (sc->ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg);
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_btree_count_blocks(cur, count);
+ xfs_btree_del_cursor(cur, error);
+ if (!error) {
+ *nextents = 0;
+ (*count)--; /* don't count the btree iroot */
+ }
+ return error;
+}
+
+/* Count the blocks used by a file, even if it's a metadata inode. */
+int
+xchk_inode_count_blocks(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ if (!ifp) {
+ *nextents = 0;
+ *count = 0;
+ return 0;
+ }
+
+ if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ return xchk_meta_btree_count_blocks(sc, nextents, count);
+ }
+
+ return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents,
+ count);
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 9ff3cafd8679..19877d99f255 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -63,6 +63,7 @@ static inline int xchk_setup_nothing(struct xfs_scrub *sc)
/* Setup functions */
int xchk_setup_agheader(struct xfs_scrub *sc);
int xchk_setup_fs(struct xfs_scrub *sc);
+int xchk_setup_rt(struct xfs_scrub *sc);
int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
int xchk_setup_ag_rmapbt(struct xfs_scrub *sc);
@@ -80,10 +81,14 @@ int xchk_setup_metapath(struct xfs_scrub *sc);
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
+int xchk_setup_rtrmapbt(struct xfs_scrub *sc);
+int xchk_setup_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xchk_setup_rtbitmap xchk_setup_nothing
# define xchk_setup_rtsummary xchk_setup_nothing
# define xchk_setup_rgsuperblock xchk_setup_nothing
+# define xchk_setup_rtrmapbt xchk_setup_nothing
+# define xchk_setup_rtrefcountbt xchk_setup_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -125,7 +130,9 @@ xchk_ag_init_existing(
#ifdef CONFIG_XFS_RT
/* All the locks we need to check an rtgroup. */
-#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP)
+#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
struct xchk_rt *sr);
@@ -141,12 +148,17 @@ xchk_rtgroup_init_existing(
return error == -ENOENT ? -EFSCORRUPTED : error;
}
-void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags);
+int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+ unsigned int rtglock_flags);
+void xchk_rtgroup_unlock(struct xchk_rt *sr);
+void xchk_rtgroup_btcur_free(struct xchk_rt *sr);
void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
#else
# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED)
# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED)
-# define xchk_rtgroup_lock(sc, lockflags) do { } while (0)
+# define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED)
+# define xchk_rtgroup_unlock(sr) do { } while (0)
+# define xchk_rtgroup_btcur_free(sr) do { } while (0)
# define xchk_rtgroup_free(sc, sr) do { } while (0)
#endif /* CONFIG_XFS_RT */
@@ -212,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
{
@@ -232,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc)
return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED);
}
-#else
-# define xchk_needs_repair(sc) (false)
-# define xchk_could_repair(sc) (false)
-#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
@@ -257,6 +264,12 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
(sc)->mp->m_super->s_id, \
(sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
##__VA_ARGS__)
+#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->sa.pag ? \
+ rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \
+ ##__VA_ARGS__)
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
@@ -274,6 +287,8 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
bool *inuse);
+int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork,
+ xfs_extnum_t *nextents, xfs_filblks_t *count);
bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip);
bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip);
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 5b6194cef3e5..38a246b8bf11 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -26,6 +26,9 @@
#include "xfs_errortag.h"
#include "xfs_icache.h"
#include "xfs_refcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -34,6 +37,7 @@
#include "scrub/bitmap.h"
#include "scrub/off_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -61,7 +65,10 @@ struct xrep_cow {
struct xoff_bitmap bad_fileoffs;
/* Bitmap of fsblocks that were removed from the CoW fork. */
- struct xfsb_bitmap old_cowfork_fsblocks;
+ union {
+ struct xfsb_bitmap old_cowfork_fsblocks;
+ struct xrtb_bitmap old_cowfork_rtblocks;
+ };
/* CoW fork mappings used to scan for bad CoW staging extents. */
struct xfs_bmbt_irec irec;
@@ -145,8 +152,7 @@ xrep_cow_mark_shared_staging(
xrep_cow_trim_refcount(xc, &rrec, rec);
return xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group),
- rrec.rc_startblock),
+ xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock),
rrec.rc_blockcount);
}
@@ -177,9 +183,8 @@ xrep_cow_mark_missing_staging(
if (xc->next_bno >= rrec.rc_startblock)
goto next;
-
error = xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno),
+ xfs_gbno_to_fsb(cur->bc_group, xc->next_bno),
rrec.rc_startblock - xc->next_bno);
if (error)
return error;
@@ -222,8 +227,7 @@ xrep_cow_mark_missing_staging_rmap(
}
return xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno),
- rec_len);
+ xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len);
}
/*
@@ -311,6 +315,92 @@ out_pag:
}
/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad_rt(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_scrub *sc = xc->sc;
+ struct xfs_rtgroup *rtg;
+ int error = 0;
+
+ xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock);
+
+ rtg = xfs_rtgroup_get(sc->mp,
+ xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock));
+ if (!rtg)
+ return -EFSCORRUPTED;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+ XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+ if (error)
+ goto out_rtg;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sr;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sr;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ xfs_rgbno_to_rtb(rtg, xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sr;
+ }
+
+ /* Mark any area has an rmap that isn't a COW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sr;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone
+ * turned on the debugging knob, replace everything in the
+ * CoW fork and then scan for staging extents in the refcountbt.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ goto out_rtg;
+ }
+
+out_sr:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+out_rtg:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+
+/*
* Allocate a replacement CoW staging extent of up to the given number of
* blocks, and fill out the mapping.
*/
@@ -343,7 +433,7 @@ xrep_cow_alloc(
if (args.fsbno == NULLFSBLOCK)
return -ENOSPC;
- xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+ xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len);
repl->fsbno = args.fsbno;
repl->len = args.len;
@@ -351,6 +441,32 @@ xrep_cow_alloc(
}
/*
+ * Allocate a replacement rt CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc_rt(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen);
+ int error;
+
+ error = xfs_trans_reserve_more(sc->tp, 0, maxrtx);
+ if (error)
+ return error;
+
+ error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false,
+ false, &repl->fsbno, &repl->len);
+ if (error)
+ return error;
+
+ xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len);
+ return 0;
+}
+
+/*
* Look up the current CoW fork mapping so that we only allocate enough to
* replace a single mapping. If we don't find a mapping that covers the start
* of the file range, or we find a delalloc or written extent, something is
@@ -467,7 +583,10 @@ xrep_cow_replace_range(
*/
alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
nextoff - startoff);
- error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_alloc_rt(sc, alloc_len, &repl);
+ else
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
if (error)
return error;
@@ -483,8 +602,12 @@ xrep_cow_replace_range(
return error;
/* Note the old CoW staging extents; we'll reap them all later. */
- error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
- repl.len);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks,
+ got.br_startblock, repl.len);
+ else
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks,
+ got.br_startblock, repl.len);
if (error)
return error;
@@ -540,8 +663,16 @@ xrep_bmap_cow(
if (!ifp)
return 0;
- /* realtime files aren't supported yet */
- if (XFS_IS_REALTIME_INODE(sc->ip))
+ /*
+ * Realtime files with large extent sizes are not supported because
+ * we could encounter an CoW mapping that has been partially written
+ * out *and* requires replacement, and there's no solution to that.
+ */
+ if (xfs_inode_has_bigrtalloc(sc->ip))
+ return -EOPNOTSUPP;
+
+ /* Metadata inodes aren't supposed to have data on the rt volume. */
+ if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip))
return -EOPNOTSUPP;
/*
@@ -562,7 +693,10 @@ xrep_bmap_cow(
xc->sc = sc;
xoff_bitmap_init(&xc->bad_fileoffs);
- xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_init(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
for_each_xfs_iext(ifp, &icur, &xc->irec) {
if (xchk_should_terminate(sc, &error))
@@ -585,7 +719,10 @@ xrep_bmap_cow(
if (xfs_bmap_is_written_extent(&xc->irec))
continue;
- error = xrep_cow_find_bad(xc);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_find_bad_rt(xc);
+ else
+ error = xrep_cow_find_bad(xc);
if (error)
goto out_bitmap;
}
@@ -600,13 +737,20 @@ xrep_bmap_cow(
* by the refcount btree, not the inode, so it is correct to treat them
* like inode metadata.
*/
- error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
- &XFS_RMAP_OINFO_COW);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks,
+ &XFS_RMAP_OINFO_COW);
+ else
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
if (error)
goto out_bitmap;
out_bitmap:
- xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
xoff_bitmap_destroy(&xc->bad_fileoffs);
kfree(xc);
return error;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index ccc6ca5934ca..3c0f25098b69 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -114,6 +114,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE },
[XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH },
[XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER },
+ [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT },
+ [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT },
};
/* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 25ee66e7649d..db6edd5a5fe5 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -260,12 +260,7 @@ xchk_inode_extsize(
xchk_ino_set_warning(sc, ino);
}
-/*
- * Validate di_cowextsize hint.
- *
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
- */
+/* Validate di_cowextsize hint. */
STATIC void
xchk_inode_cowextsize(
struct xfs_scrub *sc,
@@ -276,12 +271,25 @@ xchk_inode_cowextsize(
uint64_t flags2)
{
xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_cowextsize);
- fa = xfs_inode_validate_cowextsize(sc->mp,
- be32_to_cpu(dip->di_cowextsize), mode, flags,
- flags2);
+ fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with cowextsize and rtinherit set, the hint could become
+ * misaligned with the new rextsize. The verifier doesn't check this,
+ * because we allow rtinherit directories even without an rt device.
+ * Flag this as an administrative warning since we will clean this up
+ * eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
}
/* Make sure the di_flags make sense for the inode. */
@@ -360,8 +368,9 @@ xchk_inode_flags2(
if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
goto bad;
- /* realtime and reflink make no sense, currently */
- if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ /* realtime and reflink don't always go together */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_has_rtreflink(mp))
goto bad;
/* no bigtime iflag without the bigtime feature */
@@ -502,6 +511,10 @@ xchk_dinode(
if (!S_ISREG(mode) && !S_ISDIR(mode))
xchk_ino_set_corrupt(sc, ino);
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!S_ISREG(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
case XFS_DINODE_FMT_UUID:
default:
xchk_ino_set_corrupt(sc, ino);
@@ -686,15 +699,13 @@ xchk_inode_xref_bmap(
return;
/* Walk all the extents to check nextents/naextents/nblocks. */
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
- &nextents, &count);
+ error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
- &nextents, &acount);
+ error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents != xfs_dfork_attr_extents(dip))
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 5a58ddd27bd2..13ff1c933cb8 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -38,6 +38,9 @@
#include "xfs_log_priv.h"
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -562,8 +565,6 @@ xrep_dinode_flags(
flags2 |= XFS_DIFLAG2_REFLINK;
else
flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
- if (flags & XFS_DIFLAG_REALTIME)
- flags2 &= ~XFS_DIFLAG2_REFLINK;
if (!xfs_has_bigtime(mp))
flags2 &= ~XFS_DIFLAG2_BIGTIME;
if (!xfs_has_large_extent_counts(mp))
@@ -773,17 +774,71 @@ xrep_dinode_count_ag_rmaps(
return error;
}
+/* Count extents and blocks for an inode given an rt rmap. */
+STATIC int
+xrep_dinode_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ /* We only care about this inode. */
+ if (rec->rm_owner != ri->sc->sm->sm_ino)
+ return 0;
+
+ if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))
+ return -EFSCORRUPTED;
+
+ ri->rt_blocks += rec->rm_blockcount;
+ ri->rt_extents++;
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all realtime rmap data. */
+STATIC int
+xrep_dinode_count_rtgroup_rmaps(
+ struct xrep_inode *ri,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
+ ri);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+ return error;
+}
+
/* Count extents and blocks for a given inode from all rmap data. */
STATIC int
xrep_dinode_count_rmaps(
struct xrep_inode *ri)
{
struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
int error;
- if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+ if (!xfs_has_rmapbt(ri->sc->mp))
return -EOPNOTSUPP;
+ while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) {
+ error = xrep_dinode_count_rtgroup_rmaps(ri, rtg);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
while ((pag = xfs_perag_next(ri->sc->mp, pag))) {
error = xrep_dinode_count_ag_rmaps(ri, pag);
if (error) {
@@ -888,6 +943,85 @@ xrep_dinode_bad_bmbt_fork(
return false;
}
+/* Return true if this rmap-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rtrmapbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size)
+{
+ struct xfs_rtrmap_root *dfp;
+ unsigned int nrecs;
+ unsigned int level;
+
+ if (dfork_size < sizeof(struct xfs_rtrmap_root))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > sc->mp->m_rtrmap_maxlevels)
+ return true;
+ if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size)
+ return true;
+ if (level > 0 && nrecs == 0)
+ return true;
+
+ return false;
+}
+
+/* Return true if this refcount-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rtrefcountbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size)
+{
+ struct xfs_rtrefcount_root *dfp;
+ unsigned int nrecs;
+ unsigned int level;
+
+ if (dfork_size < sizeof(struct xfs_rtrefcount_root))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > sc->mp->m_rtrefc_maxlevels)
+ return true;
+ if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size)
+ return true;
+ if (level > 0 && nrecs == 0)
+ return true;
+
+ return false;
+}
+
+/* Check a metadata-btree fork. */
+STATIC bool
+xrep_dinode_bad_metabt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ if (whichfork != XFS_DATA_FORK)
+ return true;
+
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size);
+ default:
+ return true;
+ }
+
+ return false;
+}
+
/*
* Check the data fork for things that will fail the ifork verifiers or the
* ifork formatters.
@@ -921,9 +1055,17 @@ xrep_dinode_check_dfork(
return true;
break;
case S_IFREG:
- if (fmt == XFS_DINODE_FMT_LOCAL)
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
return true;
- fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
case S_IFLNK:
case S_IFDIR:
switch (fmt) {
@@ -968,6 +1110,11 @@ xrep_dinode_check_dfork(
XFS_DATA_FORK))
return true;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
default:
return true;
}
@@ -1088,6 +1235,11 @@ xrep_dinode_check_afork(
XFS_ATTR_FORK))
return true;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
default:
return true;
}
@@ -1135,6 +1287,8 @@ xrep_dinode_ensure_forkoff(
uint16_t mode)
{
struct xfs_bmdr_block *bmdr;
+ struct xfs_rtrmap_root *rmdr;
+ struct xfs_rtrefcount_root *rcdr;
struct xfs_scrub *sc = ri->sc;
xfs_extnum_t attr_extents, data_extents;
size_t bmdr_minsz = xfs_bmdr_space_calc(1);
@@ -1241,6 +1395,21 @@ xrep_dinode_ensure_forkoff(
bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr);
+ break;
+ default:
+ dfork_min = 0;
+ break;
+ }
+ break;
default:
dfork_min = 0;
break;
@@ -1500,8 +1669,7 @@ xrep_inode_blockcounts(
trace_xrep_inode_blockcounts(sc);
/* Set data fork counters from the data fork mappings. */
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
- &nextents, &count);
+ error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
if (error)
return error;
if (xfs_is_reflink_inode(sc->ip)) {
@@ -1525,8 +1693,8 @@ xrep_inode_blockcounts(
/* Set attr fork counters from the attr fork mappings. */
ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
if (ifp) {
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
- &nextents, &acount);
+ error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents,
+ &acount);
if (error)
return error;
if (count >= sc->mp->m_sb.sb_dblocks)
@@ -1664,10 +1832,6 @@ xrep_inode_flags(
/* DAX only applies to files and dirs. */
if (!(S_ISREG(mode) || S_ISDIR(mode)))
sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
-
- /* No reflink files on the realtime device. */
- if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
- sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
}
/*
@@ -1783,6 +1947,20 @@ xrep_inode_pptr(
sizeof(struct xfs_attr_sf_hdr), true);
}
+/* Fix COW extent size hint problems. */
+STATIC void
+xrep_inode_cowextsize(
+ struct xfs_scrub *sc)
+{
+ /* Fix misaligned CoW extent size hints on a directory. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) {
+ sc->ip->i_cowextsize = 0;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ }
+}
+
/* Fix any irregularities in an inode that the verifiers don't catch. */
STATIC int
xrep_inode_problems(
@@ -1806,6 +1984,7 @@ xrep_inode_problems(
if (S_ISDIR(VFS_I(sc->ip)->i_mode))
xrep_inode_dir_size(sc);
xrep_inode_extsize(sc);
+ xrep_inode_cowextsize(sc);
trace_xrep_inode_fixed(sc);
xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index c678cba1ffc3..e21c16fbd15d 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -21,6 +21,8 @@
#include "xfs_trans_space.h"
#include "xfs_attr.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -246,6 +248,10 @@ xchk_setup_metapath(
return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP);
case XFS_SCRUB_METAPATH_PRJQUOTA:
return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ);
+ case XFS_SCRUB_METAPATH_RTRMAPBT:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP);
+ case XFS_SCRUB_METAPATH_RTREFCOUNTBT:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT);
default:
return -ENOENT;
}
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 70af27d98734..ac38f5843090 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -19,6 +19,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
+#include "xfs_metafile.h"
+#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -121,6 +123,43 @@ xrep_newbt_init_inode(
}
/*
+ * Initialize accounting resources for staging a new metadata inode btree.
+ * If the metadata file has a space reservation, the caller must adjust that
+ * reservation when committing the new ondisk btree.
+ */
+int
+xrep_newbt_init_metadir_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_ifork *ifp;
+
+ ASSERT(xfs_is_metadir_inode(sc->ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ /*
+ * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
+ * inode metadata space reservations can only account allocated space
+ * to the i_nblocks. We do not want to change the inode core fields
+ * until we're ready to commit the new tree, so we allocate the blocks
+ * as if they were regular file blocks. This exposes us to a higher
+ * risk of the repair being cancelled due to ENOSPC.
+ */
+ xrep_newbt_init_ag(xnr, sc, &oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
+ return 0;
+}
+
+/*
* Initialize accounting resources for staging a new btree. Callers are
* expected to add their own reservations (and clean them up) manually.
*/
@@ -224,6 +263,7 @@ xrep_newbt_alloc_ag_blocks(
int error = 0;
ASSERT(sc->sa.pag != NULL);
+ ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
while (nr_blocks > 0) {
struct xfs_alloc_arg args = {
@@ -297,6 +337,8 @@ xrep_newbt_alloc_file_blocks(
struct xfs_mount *mp = sc->mp;
int error = 0;
+ ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
+
while (nr_blocks > 0) {
struct xfs_alloc_arg args = {
.tp = sc->tp,
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 3d804d31af24..5ce785599287 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -63,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
enum xfs_ag_resv_type resv);
int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc);
int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
xfs_agblock_t agbno, xfs_extlen_t len);
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index c287c755f2c5..3537f3cca6d5 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -167,10 +167,11 @@ xrep_orphanage_create(
* directory to control access to a file we put in here.
*/
if (d_really_is_negative(orphanage_dentry)) {
- error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
- 0750);
- if (error)
- goto out_dput_orphanage;
+ orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
+ orphanage_dentry, 0750);
+ error = PTR_ERR(orphanage_dentry);
+ if (IS_ERR(orphanage_dentry))
+ goto out_unlock_root;
}
/* Not a directory? Bail out. */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 183d531875ea..58d6d4ed2853 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -212,12 +212,18 @@ xchk_quota_item(
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
} else {
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
}
- if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
+ if (dq->q_ino.count > fs_icount)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/*
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index cd51f10f2920..8f4c8d41f308 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -233,7 +233,7 @@ xrep_quota_item(
rqi->need_quotacheck = true;
dirty = true;
}
- if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) {
dq->q_rtb.reserved -= dq->q_rtb.count;
dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
dq->q_rtb.count = mp->m_sb.sb_rblocks;
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 08230952053b..b32fb233cf84 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -33,6 +33,9 @@
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -40,6 +43,7 @@
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -310,7 +314,7 @@ xreap_agextent_binval(
}
out:
- trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
}
/*
@@ -369,7 +373,8 @@ xreap_agextent_select(
out_found:
*aglenp = len;
- trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+ trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
out_cur:
xfs_btree_del_cursor(cur, error);
return error;
@@ -390,6 +395,8 @@ xreap_agextent_iter(
xfs_fsblock_t fsbno;
int error = 0;
+ ASSERT(rs->resv != XFS_AG_RESV_METAFILE);
+
fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);
/*
@@ -406,7 +413,8 @@ xreap_agextent_iter(
* to run xfs_repair.
*/
if (crosslinked) {
- trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
+ *aglenp);
rs->force_roll = true;
@@ -416,7 +424,8 @@ xreap_agextent_iter(
* records from the refcountbt, which will remove the
* rmap record as well.
*/
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
+ *aglenp);
return 0;
}
@@ -424,7 +433,7 @@ xreap_agextent_iter(
*aglenp, rs->oinfo);
}
- trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
/*
* Invalidate as many buffers as we can, starting at agbno. If this
@@ -448,7 +457,7 @@ xreap_agextent_iter(
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
ASSERT(rs->resv == XFS_AG_RESV_NONE);
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
@@ -675,6 +684,263 @@ xrep_reap_fsblocks(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. Units are rt blocks, not rt extents.
+ */
+STATIC int
+xreap_rgextent_select(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_rgblock_t rgbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *rglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_rgblock_t bno = rgbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
+ error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < rgbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ *rglenp = len;
+ trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
+ *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this rtgroup extent as possible.
+ * The number of blocks disposed of will be returned in @rglenp.
+ */
+STATIC int
+xreap_rgextent_iter(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_extlen_t *rglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rtblock_t rtbno;
+ int error;
+
+ /*
+ * The only caller so far is CoW fork repair, so we only know how to
+ * unlink or free CoW staging extents. Here we don't have to worry
+ * about invalidating buffers!
+ */
+ if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the forward and reverse mapping and move on.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
+ *rglenp);
+
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ rs->deferred++;
+ return 0;
+ }
+
+ trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
+
+ /*
+ * The CoW staging extent is not crosslinked. Use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
+ rs->resv,
+ XFS_FREE_EXTENT_REALTIME |
+ XFS_FREE_EXTENT_SKIP_DISCARD);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
+
+/*
+ * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * be aligned to a realtime extent.
+ */
+STATIC int
+xreap_rtmeta_extent(
+ uint64_t rtbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
+ xfs_rgblock_t rgbno_next = rgbno + len;
+ int error = 0;
+
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sr.rtg);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to init the xchk_ag structure ourselves.
+ */
+ sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
+ if (!sc->sr.rtg)
+ return -EFSCORRUPTED;
+
+ xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+
+ while (rgbno < rgbno_next) {
+ xfs_extlen_t rglen;
+ bool crosslinked;
+
+ error = xreap_rgextent_select(rs, rgbno, rgbno_next,
+ &crosslinked, &rglen);
+ if (error)
+ goto out_unlock;
+
+ error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
+ if (error)
+ goto out_unlock;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out_unlock;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out_unlock;
+ xreap_reset(rs);
+ }
+
+ rgbno += rglen;
+ }
+
+out_unlock:
+ xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+ xfs_rtgroup_put(sc->sr.rtg);
+ sc->sr.rtg = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every rt metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_rtblocks(
+ struct xfs_scrub *sc,
+ struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Dispose of every block of an old metadata btree that used to be rooted in a
+ * metadata directory file.
+ */
+int
+xrep_reap_metadir_fsblocks(
+ struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap)
+{
+ /*
+ * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old
+ * blocks are no longer mapped by the inode, and inode metadata space
+ * reservations can only account freed space to the i_nblocks.
+ */
+ struct xfs_owner_info oinfo;
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = &oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+ ASSERT(xfs_is_metadir_inode(sc->ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+ error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
+
/*
* Metadata files are not supposed to share blocks with anything else.
* If blocks are shared, we remove the reverse mapping (thus reducing the
@@ -729,7 +995,8 @@ xreap_bmapi_select(
}
imap->br_blockcount = len;
- trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
+ trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
out_cur:
xfs_btree_del_cursor(cur, error);
return error;
@@ -868,7 +1135,8 @@ xreap_bmapi_binval(
}
out:
- trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
+ trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
+ imap->br_blockcount);
return 0;
}
@@ -895,7 +1163,7 @@ xrep_reap_bmapi_iter(
* anybody else who thinks they own the block, even though that
* runs the risk of stale buffer warnings in the future.
*/
- trace_xreap_dispose_unmap_extent(sc->sa.pag,
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
imap->br_blockcount);
@@ -918,7 +1186,7 @@ xrep_reap_bmapi_iter(
* by a block starting before the first block of the extent but overlap
* anyway.
*/
- trace_xreap_dispose_free_extent(sc->sa.pag,
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
imap->br_blockcount);
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 3f2f1775e29d..4c8f62701fb3 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -14,6 +14,15 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
const struct xfs_owner_info *oinfo);
int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
+int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap);
+
+#ifdef CONFIG_XFS_RT
+int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
+#else
+# define xrep_reap_rtblocks(...) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
/* Buffer cache scan context. */
struct xrep_bufscan {
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 1c5e45cc6419..d46528023015 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -421,7 +421,7 @@ xchk_refcount_mergeable(
if (r1->rc_refcount != r2->rc_refcount)
return false;
if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
- MAXREFCEXTLEN)
+ XFS_REFC_LEN_MAX)
return false;
return true;
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index 4e572b81c986..9c8cb5332da0 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -183,13 +183,13 @@ xrep_refc_stash(
if (xchk_should_terminate(sc, &error))
return error;
- irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+ irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
error = xrep_refc_check_ext(rr->sc, &irec);
if (error)
return error;
- trace_xrep_refc_found(sc->sa.pag, &irec);
+ trace_xrep_refc_found(pag_group(sc->sa.pag), &irec);
return xfarray_append(rr->refcount_records, &irec);
}
@@ -422,7 +422,7 @@ xrep_refc_find_refcounts(
/*
* Set up a bag to store all the rmap records that we're tracking to
* generate a reference count record. If the size of the bag exceeds
- * MAXREFCOUNT, we clamp rc_refcount.
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
*/
error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 91c8bc055a4f..3b5288d3ef4e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -37,6 +37,12 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_dir2.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -62,6 +68,7 @@ xrep_attempt(
trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
xchk_ag_btcur_free(&sc->sa);
+ xchk_rtgroup_btcur_free(&sc->sr);
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
@@ -378,6 +385,41 @@ xrep_calc_ag_resblks(
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out how many blocks to reserve for a rtgroup repair. We calculate
+ * the worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-rtgroup btree.
+ */
+xfs_extlen_t
+xrep_calc_rtgroup_resblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ uint64_t usedlen;
+ xfs_extlen_t rmapbt_sz = 0;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+ if (!xfs_has_rtgroups(mp)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno));
+ ASSERT(usedlen <= XFS_MAX_RGBLOCKS);
+
+ if (xfs_has_rmapbt(mp))
+ rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen);
+
+ trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen,
+ rmapbt_sz);
+
+ return rmapbt_sz;
+}
+#endif /* CONFIG_XFS_RT */
+
/*
* Reconstructing per-AG Btrees
*
@@ -954,6 +996,27 @@ xrep_ag_init(
}
#ifdef CONFIG_XFS_RT
+/* Initialize all the btree cursors for a RT repair. */
+void
+xrep_rtgroup_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ ASSERT(sr->rtg != NULL);
+
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT &&
+ (sr->rtlock_flags & XFS_RTGLOCK_RMAP) &&
+ xfs_has_rtrmapbt(mp))
+ sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT &&
+ (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) &&
+ xfs_has_rtreflink(mp))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
+}
+
/*
* Given a reference to a rtgroup structure, lock rtgroup btree inodes and
* create btree cursors. Must only be called to repair a regular rt file.
@@ -972,6 +1035,33 @@ xrep_rtgroup_init(
/* Grab our own passive reference from the caller's ref. */
sr->rtg = xfs_rtgroup_hold(rtg);
+ xrep_rtgroup_btcur_init(sc, sr);
+ return 0;
+}
+
+/* Ensure that all rt blocks in the given range are not marked free. */
+int
+xrep_require_rtext_inuse(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t rgbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_rtxnum_t startrtx;
+ xfs_rtxnum_t endrtx;
+ bool is_free = false;
+ int error;
+
+ startrtx = xfs_rgbno_to_rtx(mp, rgbno);
+ endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
+
+ error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx,
+ endrtx - startrtx + 1, &is_free);
+ if (error)
+ return error;
+ if (is_free)
+ return -EFSCORRUPTED;
+
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -1237,3 +1327,110 @@ xrep_buf_verify_struct(
return fa == NULL;
}
+
+/* Check the sanity of a rmap record for a metadata btree inode. */
+int
+xrep_check_ino_btree_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /*
+ * Metadata btree inodes never have extended attributes, and all blocks
+ * should have the bmbt block flag set.
+ */
+ if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) ||
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return -EFSCORRUPTED;
+
+ /* Make sure the block is within the AG. */
+ if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Reset the block count of the inode being repaired, and adjust the dquot
+ * block usage to match. The inode must not have an xattr fork.
+ */
+void
+xrep_inode_set_nblocks(
+ struct xfs_scrub *sc,
+ int64_t new_blocks)
+{
+ int64_t delta =
+ new_blocks - sc->ip->i_nblocks;
+
+ sc->ip->i_nblocks = new_blocks;
+
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ if (delta != 0)
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT,
+ delta);
+}
+
+/* Reset the block reservation for a metadata inode. */
+int
+xrep_reset_metafile_resv(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ int64_t delta;
+ int error;
+
+ delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
+ if (delta == 0)
+ return 0;
+
+ /*
+ * Too many blocks have been reserved, transfer some from the incore
+ * reservation back to the filesystem.
+ */
+ if (delta > 0) {
+ int64_t give_back;
+
+ give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
+ if (give_back > 0) {
+ xfs_mod_delalloc(ip, 0, -give_back);
+ xfs_add_fdblocks(ip->i_mount, give_back);
+ ip->i_delayed_blks -= give_back;
+ }
+
+ return 0;
+ }
+
+ /*
+ * Not enough reservation; try to take some blocks from the filesystem
+ * to the metadata inode. @delta is negative here, so invert the sign.
+ */
+ delta = -delta;
+ error = xfs_dec_fdblocks(sc->mp, delta, true);
+ while (error == -ENOSPC) {
+ delta--;
+ if (delta == 0) {
+ xfs_warn(sc->mp,
+"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
+ ip->i_ino);
+ return 0;
+ }
+ error = xfs_dec_fdblocks(sc->mp, delta, true);
+ }
+ if (error)
+ return error;
+
+ xfs_mod_delalloc(ip, 0, delta);
+ ip->i_delayed_blks += delta;
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index b649da1a93eb..af0a3a9e5ed9 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -50,7 +50,9 @@ xrep_trans_commit(
struct xbitmap;
struct xagb_bitmap;
+struct xrgb_bitmap;
struct xfsb_bitmap;
+struct xrtb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
@@ -97,6 +99,8 @@ int xrep_setup_parent(struct xfs_scrub *sc);
int xrep_setup_nlinks(struct xfs_scrub *sc);
int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
int xrep_setup_dirtree(struct xfs_scrub *sc);
+int xrep_setup_rtrmapbt(struct xfs_scrub *sc);
+int xrep_setup_rtrefcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -110,10 +114,18 @@ int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
#ifdef CONFIG_XFS_RT
int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
struct xchk_rt *sr, unsigned int rtglock_flags);
+void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_filblks_t len);
+xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc);
#else
# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS)
+# define xrep_calc_rtgroup_resblks(sc) (0)
#endif /* CONFIG_XFS_RT */
+int xrep_check_ino_btree_mapping(struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec);
+
/* Metadata revalidators */
int xrep_revalidate_allocbt(struct xfs_scrub *sc);
@@ -147,10 +159,14 @@ int xrep_metapath(struct xfs_scrub *sc);
int xrep_rtbitmap(struct xfs_scrub *sc);
int xrep_rtsummary(struct xfs_scrub *sc);
int xrep_rgsuperblock(struct xfs_scrub *sc);
+int xrep_rtrmapbt(struct xfs_scrub *sc);
+int xrep_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
# define xrep_rtsummary xrep_notsupported
# define xrep_rgsuperblock xrep_notsupported
+# define xrep_rtrmapbt xrep_notsupported
+# define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
@@ -169,11 +185,22 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
+int xrep_reset_metafile_resv(struct xfs_scrub *sc);
#else
#define xrep_ino_dqattach(sc) (0)
-#define xrep_will_attempt(sc) (false)
+
+/*
+ * When online repair is not built into the kernel, we still want to attempt
+ * the repair so that the stub xrep_attempt below will return EOPNOTSUPP.
+ */
+static inline bool xrep_will_attempt(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ xchk_needs_repair(sc->sm);
+}
static inline int
xrep_attempt(
@@ -192,6 +219,8 @@ xrep_calc_ag_resblks(
return 0;
}
+#define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks
+
static inline int
xrep_reset_perag_resv(
struct xfs_scrub *sc)
@@ -219,6 +248,8 @@ xrep_setup_nothing(
#define xrep_setup_nlinks xrep_setup_nothing
#define xrep_setup_dirtree xrep_setup_nothing
#define xrep_setup_metapath xrep_setup_nothing
+#define xrep_setup_rtrmapbt xrep_setup_nothing
+#define xrep_setup_rtrefcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -256,6 +287,8 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
#define xrep_dirtree xrep_notsupported
#define xrep_metapath xrep_notsupported
#define xrep_rgsuperblock xrep_notsupported
+#define xrep_rtrmapbt xrep_notsupported
+#define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h
new file mode 100644
index 000000000000..4c3126b66dcb
--- /dev/null
+++ b/fs/xfs/scrub/rgb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RGB_BITMAP_H__
+#define __XFS_SCRUB_RGB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rgblock_t */
+
+struct xrgb_bitmap {
+ struct xbitmap32 rgbitmap;
+};
+
+static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->rgbitmap);
+}
+
+static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->rgbitmap);
+}
+
+static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap,
+ xfs_rgblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->rgbitmap, start, len);
+}
+
+static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->rgbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c
index 463b3573bb76..d189732d0e24 100644
--- a/fs/xfs/scrub/rgsuper.c
+++ b/fs/xfs/scrub/rgsuper.c
@@ -13,6 +13,7 @@
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -34,6 +35,7 @@ xchk_rgsuperblock_xref(
return;
xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1);
+ xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS);
}
int
@@ -61,7 +63,9 @@ xchk_rgsuperblock(
if (!xchk_xref_process_error(sc, 0, 0, &error))
return error;
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ return error;
/*
* Since we already validated the rt superblock at mount time, we don't
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index a0a227d183d2..f5f73078ffe2 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -31,6 +31,9 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -499,6 +502,69 @@ xrep_rmap_scan_iext(
return xrep_rmap_stash_accumulated(rf);
}
+static int
+xrep_rmap_scan_meta_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = rf->rr->sc;
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ enum xfs_rtg_inodes type;
+ int error;
+
+ if (rf->whichfork != XFS_DATA_FORK)
+ return -EFSCORRUPTED;
+
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ type = XFS_RTGI_RMAP;
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ type = XFS_RTGI_REFCOUNT;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ if (ip == rtg->rtg_inodes[type])
+ goto found;
+ }
+
+ /*
+ * We should never find an rt metadata btree inode that isn't
+ * associated with an rtgroup yet has ondisk blocks allocated to it.
+ */
+ if (ip->i_nblocks) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+
+found:
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_rtg;
+ }
+
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+ xfs_btree_del_cursor(cur, error);
+out_rtg:
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
/* Find all the extents from a given AG in an inode fork. */
STATIC int
xrep_rmap_scan_ifork(
@@ -512,14 +578,14 @@ xrep_rmap_scan_ifork(
.whichfork = whichfork,
};
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ bool mappings_done;
int error = 0;
if (!ifp)
return 0;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
- bool mappings_done;
-
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_BTREE:
/*
* Scan the bmap btree for data device mappings. This includes
* the btree blocks themselves, even if this is a realtime
@@ -528,15 +594,18 @@ xrep_rmap_scan_ifork(
error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
if (error || mappings_done)
return error;
- } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
- return 0;
+ fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ /* Scan incore extent cache if this isn't a realtime file. */
+ if (xfs_ifork_is_realtime(ip, whichfork))
+ return 0;
+
+ return xrep_rmap_scan_iext(&rf, ifp);
+ case XFS_DINODE_FMT_META_BTREE:
+ return xrep_rmap_scan_meta_btree(&rf, ip);
}
- /* Scan incore extent cache if this isn't a realtime file. */
- if (xfs_ifork_is_realtime(ip, whichfork))
- return 0;
-
- return xrep_rmap_scan_iext(&rf, ifp);
+ return 0;
}
/*
@@ -1552,7 +1621,7 @@ xrep_rmapbt_live_update(
if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
goto out_unlock;
- trace_xrep_rmap_live_update(rr->sc->sa.pag, action, p);
+ trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
if (error)
diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h
new file mode 100644
index 000000000000..1313ef605511
--- /dev/null
+++ b/fs/xfs/scrub/rtb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTB_BITMAP_H__
+#define __XFS_SCRUB_RTB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rtblock_t */
+
+struct xrtb_bitmap {
+ struct xbitmap64 rtbitmap;
+};
+
+static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->rtbitmap);
+}
+
+static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->rtbitmap);
+}
+
+static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap,
+ xfs_rtblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->rtbitmap, start, len);
+}
+
+static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->rtbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 376a36fd9a9c..e8c776a34c1d 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,17 +9,24 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bit.h"
+#include "xfs_rtgroup.h"
#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_exchmaps.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
+#include "scrub/tempexch.h"
#include "scrub/rtbitmap.h"
+#include "scrub/btree.h"
/* Set us up with the realtime metadata locked. */
int
@@ -30,10 +37,15 @@ xchk_setup_rtbitmap(
struct xchk_rtbitmap *rtb;
int error;
- rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)),
+ XCHK_GFP_FLAGS);
if (!rtb)
return -ENOMEM;
sc->buf = rtb;
+ rtb->sc = sc;
error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
if (error)
@@ -49,8 +61,7 @@ xchk_setup_rtbitmap(
if (error)
return error;
- error = xchk_install_live_inode(sc,
- sc->sr.rtg->rtg_inodes[XFS_RTGI_BITMAP]);
+ error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg));
if (error)
return error;
@@ -58,12 +69,15 @@ xchk_setup_rtbitmap(
if (error)
return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+ if (error)
+ return error;
+
/*
* Now that we've locked the rtbitmap, we can't race with growfsrt
* trying to expand the bitmap or change the size of the rt volume.
* Hence it is safe to compute and check the geometry values.
*/
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
@@ -73,7 +87,32 @@ xchk_setup_rtbitmap(
return 0;
}
-/* Realtime bitmap. */
+/* Per-rtgroup bitmap contents. */
+
+/* Cross-reference rtbitmap entries with other metadata. */
+STATIC void
+xchk_rtbitmap_xref(
+ struct xchk_rtbitmap *rtb,
+ xfs_rtblock_t startblock,
+ xfs_rtblock_t blockcount)
+{
+ struct xfs_scrub *sc = rtb->sc;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+ if (!sc->sr.rmap_cur)
+ return;
+
+ xchk_xref_has_no_rt_owner(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_shared(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount);
+
+ if (rtb->next_free_rgbno < rgbno)
+ xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
+ rgbno - rtb->next_free_rgbno);
+ rtb->next_free_rgbno = rgbno + blockcount;
+}
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
@@ -83,7 +122,8 @@ xchk_rtbitmap_rec(
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_scrub *sc = priv;
+ struct xchk_rtbitmap *rtb = priv;
+ struct xfs_scrub *sc = rtb->sc;
xfs_rtblock_t startblock;
xfs_filblks_t blockcount;
@@ -92,6 +132,12 @@ xchk_rtbitmap_rec(
if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+ xchk_rtbitmap_xref(rtb, startblock, blockcount);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
return 0;
}
@@ -139,15 +185,16 @@ xchk_rtbitmap_check_extents(
return error;
}
-/* Scrub the realtime bitmap. */
+/* Scrub this group's realtime bitmap. */
int
xchk_rtbitmap(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
struct xchk_rtbitmap *rtb = sc->buf;
+ xfs_rgblock_t last_rgbno;
int error;
/* Is sb_rextents correct? */
@@ -200,10 +247,20 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, sc);
+ rtb->next_free_rgbno = 0;
+ error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
+ /*
+ * Check that the are rmappings for all rt extents between the end of
+ * the last free extent we saw and the last possible extent in the rt
+ * group.
+ */
+ last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1;
+ if (rtb->next_free_rgbno < last_rgbno)
+ xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
+ last_rgbno - rtb->next_free_rgbno);
return 0;
}
@@ -215,7 +272,7 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index 85304ff019e1..fe52b877253d 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -6,17 +6,72 @@
#ifndef __XFS_SCRUB_RTBITMAP_H__
#define __XFS_SCRUB_RTBITMAP_H__
+/*
+ * We use an xfile to construct new bitmap blocks for the portion of the
+ * rtbitmap file that we're replacing. Whereas the ondisk bitmap must be
+ * accessed through the buffer cache, the xfile bitmap supports direct
+ * word-level accesses. Therefore, we create a small abstraction for linear
+ * access.
+ */
+typedef unsigned long long xrep_wordoff_t;
+typedef unsigned int xrep_wordcnt_t;
+
+/* Mask to round an rtx down to the nearest bitmap word. */
+#define XREP_RTBMP_WORDMASK ((1ULL << XFS_NBWORDLOG) - 1)
+
+
struct xchk_rtbitmap {
+ struct xfs_scrub *sc;
+
uint64_t rextents;
uint64_t rbmblocks;
unsigned int rextslog;
unsigned int resblks;
+
+ /* The next free rt group block number that we expect to see. */
+ xfs_rgblock_t next_free_rgbno;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /* stuff for staging a new bitmap */
+ struct xfs_rtalloc_args args;
+ struct xrep_tempexch tempexch;
+#endif
+
+ /* The next rtgroup block we expect to see during our rtrmapbt walk. */
+ xfs_rgblock_t next_rgbno;
+
+ /* rtgroup lock flags */
+ unsigned int rtglock_flags;
+
+ /* rtword position of xfile as we write buffers to disk. */
+ xrep_wordoff_t prep_wordoff;
+
+ /* In-Memory rtbitmap for repair. */
+ union xfs_rtword_raw words[];
};
#ifdef CONFIG_XFS_ONLINE_REPAIR
int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+
+/*
+ * How big should the words[] buffer be?
+ *
+ * For repairs, we want a full fsblock worth of space so that we can memcpy a
+ * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have
+ * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the
+ * words buffer at all.
+ */
+static inline unsigned int
+xchk_rtbitmap_wordcnt(
+ struct xfs_scrub *sc)
+{
+ if (xchk_could_repair(sc))
+ return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+ return 0;
+}
#else
# define xrep_setup_rtbitmap(sc, rtb) (0)
+# define xchk_rtbitmap_wordcnt(sc) (0)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
#endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 0fef98e9f834..203a1a97c502 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -12,32 +12,66 @@
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_extent_busy.h"
+#include "xfs_refcount.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
#include "scrub/rtbitmap.h"
-/* Set up to repair the realtime bitmap file metadata. */
+/* rt bitmap content repairs */
+
+/* Set up to repair the realtime bitmap for this group. */
int
xrep_setup_rtbitmap(
struct xfs_scrub *sc,
struct xchk_rtbitmap *rtb)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long blocks = 0;
+ char *descr;
+ unsigned long long blocks = mp->m_sb.sb_rbmblocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ /* Create an xfile to hold our reconstructed bitmap. */
+ descr = xchk_xfile_rtgroup_descr(sc, "bitmap file");
+ error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile);
+ kfree(descr);
+ if (error)
+ return error;
/*
- * Reserve enough blocks to write out a completely new bmbt for a
- * maximally fragmented bitmap file. We do not hold the rtbitmap
- * ILOCK yet, so this is entirely speculative.
+ * Reserve enough blocks to write out a completely new bitmap file,
+ * plus twice as many blocks as we would need if we can only allocate
+ * one block per data fork mapping. This should cover the
+ * preallocation of the temporary file and exchanging the extent
+ * mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement bitmap and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * rtbitmap ILOCK) and cannot ask for more reservation.
*/
- blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
if (blocks > UINT_MAX)
return -EOPNOTSUPP;
@@ -45,6 +79,325 @@ xrep_setup_rtbitmap(
return 0;
}
+static inline xrep_wordoff_t
+rtx_to_wordoff(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> XFS_NBWORDLOG;
+}
+
+static inline xrep_wordcnt_t
+rtxlen_to_wordcnt(
+ xfs_rtxlen_t rtxlen)
+{
+ return rtxlen >> XFS_NBWORDLOG;
+}
+
+/* Helper functions to record rtwords in an xfile. */
+
+static inline int
+xfbmp_load(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ xfs_rtword_t *word)
+{
+ union xfs_rtword_raw urk;
+ int error;
+
+ ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+ error = xfile_load(rtb->sc->xfile, &urk,
+ sizeof(union xfs_rtword_raw),
+ wordoff << XFS_WORDLOG);
+ if (error)
+ return error;
+
+ *word = be32_to_cpu(urk.rtg);
+ return 0;
+}
+
+static inline int
+xfbmp_store(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ const xfs_rtword_t word)
+{
+ union xfs_rtword_raw urk;
+
+ ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+ urk.rtg = cpu_to_be32(word);
+ return xfile_store(rtb->sc->xfile, &urk,
+ sizeof(union xfs_rtword_raw),
+ wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyin(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ const union xfs_rtword_raw *word,
+ xrep_wordcnt_t nr_words)
+{
+ return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+ wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyout(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ union xfs_rtword_raw *word,
+ xrep_wordcnt_t nr_words)
+{
+ return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+ wordoff << XFS_WORDLOG);
+}
+
+/* Perform a logical OR operation on an rtword in the incore bitmap. */
+static int
+xrep_rtbitmap_or(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ xfs_rtword_t mask)
+{
+ xfs_rtword_t word;
+ int error;
+
+ error = xfbmp_load(rtb, wordoff, &word);
+ if (error)
+ return error;
+
+ trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word);
+
+ return xfbmp_store(rtb, wordoff, word | mask);
+}
+
+/*
+ * Mark as free every rt extent between the next rt block we expected to see
+ * in the rtrmap records and the given rt block.
+ */
+STATIC int
+xrep_rtbitmap_mark_free(
+ struct xchk_rtbitmap *rtb,
+ xfs_rgblock_t rgbno)
+{
+ struct xfs_mount *mp = rtb->sc->mp;
+ struct xchk_rt *sr = &rtb->sc->sr;
+ struct xfs_rtgroup *rtg = sr->rtg;
+ xfs_rtxnum_t startrtx;
+ xfs_rtxnum_t nextrtx;
+ xrep_wordoff_t wordoff, nextwordoff;
+ unsigned int bit;
+ unsigned int bufwsize;
+ xfs_extlen_t mod;
+ xfs_rtword_t mask;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno))
+ return -EFSCORRUPTED;
+
+ /*
+ * Convert rt blocks to rt extents The block range we find must be
+ * aligned to an rtextent boundary on both ends.
+ */
+ startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno);
+ mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno);
+ if (mod)
+ return -EFSCORRUPTED;
+
+ nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1;
+ mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1);
+ if (mod != mp->m_sb.sb_rextsize - 1)
+ return -EFSCORRUPTED;
+
+ /* Must not be shared or CoW staging. */
+ if (sr->refc_cur) {
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_COW, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1);
+
+ /* Set bits as needed to round startrtx up to the nearest word. */
+ bit = startrtx & XREP_RTBMP_WORDMASK;
+ if (bit) {
+ xfs_rtblock_t len = nextrtx - startrtx;
+ unsigned int lastbit;
+
+ lastbit = min(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+
+ error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx),
+ mask);
+ if (error || lastbit - bit == len)
+ return error;
+ startrtx += XFS_NBWORD - bit;
+ }
+
+ /* Set bits as needed to round nextrtx down to the nearest word. */
+ bit = nextrtx & XREP_RTBMP_WORDMASK;
+ if (bit) {
+ mask = ((xfs_rtword_t)1 << bit) - 1;
+
+ error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx),
+ mask);
+ if (error || startrtx + bit == nextrtx)
+ return error;
+ nextrtx -= bit;
+ }
+
+ trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1);
+
+ /* Set all the words in between, up to a whole fs block at once. */
+ wordoff = rtx_to_wordoff(mp, startrtx);
+ nextwordoff = rtx_to_wordoff(mp, nextrtx);
+ bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+
+ while (wordoff < nextwordoff) {
+ xrep_wordoff_t rem;
+ xrep_wordcnt_t wordcnt;
+
+ wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff,
+ bufwsize);
+
+ /*
+ * Try to keep us aligned to the rtwords buffer to reduce the
+ * number of xfile writes.
+ */
+ rem = wordoff & (bufwsize - 1);
+ if (rem)
+ wordcnt = min_t(xrep_wordcnt_t, wordcnt,
+ bufwsize - rem);
+
+ error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt);
+ if (error)
+ return error;
+
+ wordoff += wordcnt;
+ }
+
+ return 0;
+}
+
+/* Set free space in the rtbitmap based on rtrmapbt records. */
+STATIC int
+xrep_rtbitmap_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rtbitmap *rtb = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rtb->sc, &error))
+ return error;
+
+ if (rtb->next_rgbno < rec->rm_startblock) {
+ error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ rtb->next_rgbno = max(rtb->next_rgbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Walk the rtrmapbt to find all the gaps between records, and mark the gaps
+ * in the realtime bitmap that we're computing.
+ */
+STATIC int
+xrep_rtbitmap_find_freespace(
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_scrub *sc = rtb->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ uint64_t blockcount;
+ int error;
+
+ /* Prepare a buffer of ones so that we can accelerate bulk setting. */
+ memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize);
+
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap,
+ rtb);
+ if (error)
+ goto out;
+
+ /*
+ * Mark as free every possible rt extent from the last one we saw to
+ * the end of the rt group.
+ */
+ blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ if (rtb->next_rgbno < blockcount) {
+ error = xrep_rtbitmap_mark_free(rtb, blockcount);
+ if (error)
+ goto out;
+ }
+
+out:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ return error;
+}
+
+static int
+xrep_rtbitmap_prep_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ void *data)
+{
+ struct xchk_rtbitmap *rtb = data;
+ struct xfs_mount *mp = sc->mp;
+ union xfs_rtword_raw *ondisk;
+ int error;
+
+ rtb->args.mp = sc->mp;
+ rtb->args.tp = sc->tp;
+ rtb->args.rbmbp = bp;
+ ondisk = xfs_rbmblock_wordptr(&rtb->args, 0);
+ rtb->args.rbmbp = NULL;
+
+ error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk,
+ mp->m_blockwsize);
+ if (error)
+ return error;
+
+ if (xfs_has_rtgroups(sc->mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+ hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ bp->b_ops = &xfs_rtbitmap_buf_ops;
+ } else {
+ bp->b_ops = &xfs_rtbuf_ops;
+ }
+
+ rtb->prep_wordoff += mp->m_blockwsize;
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF);
+ return 0;
+}
+
/*
* Make sure that the given range of the data fork of the realtime file is
* mapped to written blocks. The caller must ensure that the inode is joined
@@ -160,9 +513,18 @@ xrep_rtbitmap(
{
struct xchk_rtbitmap *rtb = sc->buf;
struct xfs_mount *mp = sc->mp;
+ struct xfs_group *xg = rtg_group(sc->sr.rtg);
unsigned long long blocks = 0;
+ unsigned int busy_gen;
int error;
+ /* We require the realtime rmapbt to rebuild anything. */
+ if (!xfs_has_rtrmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
/* Impossibly large rtbitmap means we can't touch the filesystem. */
if (rtb->rbmblocks > U32_MAX)
return 0;
@@ -195,6 +557,79 @@ xrep_rtbitmap(
if (error)
return error;
- /* Fix inconsistent bitmap geometry */
- return xrep_rtbitmap_geometry(sc, rtb);
+ /*
+ * Fix inconsistent bitmap geometry. This function returns with a
+ * clean scrub transaction.
+ */
+ error = xrep_rtbitmap_geometry(sc, rtb);
+ if (error)
+ return error;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice.
+ */
+ if (!xfs_extent_busy_list_empty(xg, &busy_gen)) {
+ error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Generate the new rtbitmap data. We don't need the rtbmp information
+ * once this call is finished.
+ */
+ error = xrep_rtbitmap_find_freespace(rtb);
+ if (error)
+ return error;
+
+ /*
+ * Try to take ILOCK_EXCL of the temporary file. We had better be the
+ * only ones holding onto this inode, but we can't block while holding
+ * the rtbitmap file's ILOCK_EXCL.
+ */
+ while (!xrep_tempfile_ilock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ /*
+ * Make sure we have space allocated for the part of the bitmap
+ * file that corresponds to this group. We already joined sc->ip.
+ */
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Copy the bitmap file that we generated. */
+ error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks,
+ xrep_rtbitmap_prep_buf, rtb);
+ if (error)
+ return error;
+ error = xrep_tempfile_set_isize(sc,
+ XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks));
+ if (error)
+ return error;
+
+ /*
+ * Now exchange the data fork contents. We're done with the temporary
+ * buffer, so we can reuse it for the tempfile exchmaps information.
+ */
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0,
+ rtb->rbmblocks, &rtb->tempexch);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_contents(sc, &rtb->tempexch);
+ if (error)
+ return error;
+
+ /* Free the old rtbitmap blocks if they're not in use. */
+ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
}
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
new file mode 100644
index 000000000000..4c5dffc73641
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/repair.h"
+
+/* Set us up with the realtime refcount metadata locked. */
+int
+xchk_setup_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtrefcountbt(sc);
+ if (error)
+ return error;
+ }
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+
+ error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg));
+ if (error)
+ return error;
+
+ return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+}
+
+/* Realtime Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely. The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ * a. If a given rmap completely overlaps, mark it as seen.
+ * b. Otherwise, record the fragment (in agbno order) for later
+ * processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks. Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr. Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ * should start at or before the start of the extent.
+ * Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ * a. Find the shortest fragments in the set and remove them.
+ * b. Note the block number of the end of these fragments.
+ * c. Pull the same number of fragments from the list. All of these
+ * fragments should start at the block number recorded in the
+ * previous step.
+ * d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ * and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true. If not, the refcount is incorrect.
+ */
+struct xchk_rtrefcnt_frag {
+ struct list_head list;
+ struct xfs_rmap_irec rm;
+};
+
+struct xchk_rtrefcnt_check {
+ struct xfs_scrub *sc;
+ struct list_head fragments;
+
+ /* refcount extent we're examining */
+ xfs_rgblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+
+ /* number of owners seen */
+ xfs_nlink_t seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the rtrefcountbt says we should have.
+ */
+STATIC int
+xchk_rtrefcountbt_rmap_check(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rtrefcnt_check *refchk = priv;
+ struct xchk_rtrefcnt_frag *frag;
+ xfs_rgblock_t rm_last;
+ xfs_rgblock_t rc_last;
+ int error = 0;
+
+ if (xchk_should_terminate(refchk->sc, &error))
+ return error;
+
+ rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+ rc_last = refchk->bno + refchk->len - 1;
+
+ /* Confirm that a single-owner refc extent is a CoW stage. */
+ if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+ xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+ return 0;
+ }
+
+ if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+ /*
+ * The rmap overlaps the refcount record, so we can confirm
+ * one refcount owner seen.
+ */
+ refchk->seen++;
+ } else {
+ /*
+ * This rmap covers only part of the refcount record, so
+ * save the fragment for later processing. If the rmapbt
+ * is healthy each rmap_irec we see will be in agbno order
+ * so we don't need insertion sort here.
+ */
+ frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag),
+ XCHK_GFP_FLAGS);
+ if (!frag)
+ return -ENOMEM;
+ memcpy(&frag->rm, rec, sizeof(frag->rm));
+ list_add_tail(&frag->list, &refchk->fragments);
+ }
+
+ return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount. If this ever deviates from
+ * what we expect (which is the rtrefcountbt's refcount minus the
+ * number of extents that totally covered the rtrefcountbt extent),
+ * we have a rtrefcountbt error.
+ */
+STATIC void
+xchk_rtrefcountbt_process_rmap_fragments(
+ struct xchk_rtrefcnt_check *refchk)
+{
+ struct list_head worklist;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ xfs_rgblock_t bno;
+ xfs_rgblock_t rbno;
+ xfs_rgblock_t next_rbno;
+ xfs_nlink_t nr;
+ xfs_nlink_t target_nr;
+
+ target_nr = refchk->refcount - refchk->seen;
+ if (target_nr == 0)
+ return;
+
+ /*
+ * There are (refchk->rc.rc_refcount - refchk->nr refcount)
+ * references we haven't found yet. Pull that many off the
+ * fragment list and figure out where the smallest rmap ends
+ * (and therefore the next rmap should start). All the rmaps
+ * we pull off should start at or before the beginning of the
+ * refcount record's range.
+ */
+ INIT_LIST_HEAD(&worklist);
+ rbno = NULLRGBLOCK;
+
+ /* Make sure the fragments actually /are/ in bno order. */
+ bno = 0;
+ list_for_each_entry(frag, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock < bno)
+ goto done;
+ bno = frag->rm.rm_startblock;
+ }
+
+ /*
+ * Find all the rmaps that start at or before the refc extent,
+ * and put them on the worklist.
+ */
+ nr = 0;
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno < rbno)
+ rbno = bno;
+ list_move_tail(&frag->list, &worklist);
+ nr++;
+ }
+
+ /*
+ * We should have found exactly $target_nr rmap fragments starting
+ * at or before the refcount extent.
+ */
+ if (nr != target_nr)
+ goto done;
+
+ while (!list_empty(&refchk->fragments)) {
+ /* Discard any fragments ending at rbno from the worklist. */
+ nr = 0;
+ next_rbno = NULLRGBLOCK;
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno != rbno) {
+ if (bno < next_rbno)
+ next_rbno = bno;
+ continue;
+ }
+ list_del(&frag->list);
+ kfree(frag);
+ nr++;
+ }
+
+ /* Try to add nr rmaps starting at rbno to the worklist. */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (frag->rm.rm_startblock != rbno)
+ goto done;
+ list_move_tail(&frag->list, &worklist);
+ if (next_rbno > bno)
+ next_rbno = bno;
+ nr--;
+ if (nr == 0)
+ break;
+ }
+
+ /*
+ * If we get here and nr > 0, this means that we added fewer
+ * items to the worklist than we discarded because the fragment
+ * list ran out of items. Therefore, we cannot maintain the
+ * required refcount. Something is wrong, so we're done.
+ */
+ if (nr)
+ goto done;
+
+ rbno = next_rbno;
+ }
+
+ /*
+ * Make sure the last extent we processed ends at or beyond
+ * the end of the refcount extent.
+ */
+ if (rbno < refchk->bno + refchk->len)
+ goto done;
+
+ /* Actually record us having seen the remaining refcount. */
+ refchk->seen = refchk->refcount;
+done:
+ /* Delete fragments and work list. */
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xchk_rtrefcountbt_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ struct xchk_rtrefcnt_check refchk = {
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
+ .seen = 0,
+ };
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Cross-reference with the rmapbt to confirm the refcount. */
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = irec->rc_startblock;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
+
+ INIT_LIST_HEAD(&refchk.fragments);
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check, &refchk);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ goto out_free;
+
+ xchk_rtrefcountbt_process_rmap_fragments(&refchk);
+ if (irec->rc_refcount != refchk.seen)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+
+out_free:
+ list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_rtrefcountbt_xref(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc,
+ xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock),
+ irec->rc_blockcount);
+ xchk_rtrefcountbt_xref_rmap(sc, irec);
+}
+
+struct xchk_rtrefcbt_records {
+ /* Previous refcount record. */
+ struct xfs_refcount_irec prev_rec;
+
+ /* The next rtgroup block where we aren't expecting shared extents. */
+ xfs_rgblock_t next_unshared_rgbno;
+
+ /* Number of CoW blocks we expect. */
+ xfs_extlen_t cow_blocks;
+
+ /* Was the last record a shared or CoW staging extent? */
+ enum xfs_refc_domain prev_domain;
+};
+
+static inline bool
+xchk_rtrefcount_mergeable(
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *r2)
+{
+ const struct xfs_refcount_irec *r1 = &rrc->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+ if (r1->rc_blockcount > 0)
+ return false;
+
+ if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+ return false;
+ if (r1->rc_refcount != r2->rc_refcount)
+ return false;
+ if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+ XFS_REFC_LEN_MAX)
+ return false;
+
+ return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrefcountbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rtrefcount_mergeable(rrc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
+STATIC int
+xchk_rtrefcountbt_rmap_check_gap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ xfs_rgblock_t *next_bno = priv;
+
+ if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno)
+ return -ECANCELED;
+
+ *next_bno = rec->rm_startblock + rec->rm_blockcount;
+ return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_rtrefcountbt_xref_gaps(
+ struct xfs_scrub *sc,
+ struct xchk_rtrefcbt_records *rrc,
+ xfs_rtblock_t bno)
+{
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ xfs_rgblock_t next_bno = NULLRGBLOCK;
+ int error;
+
+ if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur ||
+ xchk_skip_xref(sc->sm))
+ return;
+
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = rrc->next_unshared_rgbno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno - 1;
+
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check_gap, &next_bno);
+ if (error == -ECANCELED)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ else
+ xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur);
+}
+
+/* Scrub a rtrefcountbt record. */
+STATIC int
+xchk_rtrefcountbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xchk_rtrefcbt_records *rrc = bs->private;
+ struct xfs_refcount_irec irec;
+ u32 mod;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) !=
+ NULL) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* We can only share full rt extents. */
+ mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ rrc->cow_blocks += irec.rc_blockcount;
+
+ /* Shared records always come before CoW records. */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+ rrc->prev_domain == XFS_REFC_DOMAIN_COW)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ rrc->prev_domain = irec.rc_domain;
+
+ xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
+ xchk_rtrefcountbt_xref(bs->sc, &irec);
+
+ /*
+ * If this is a record for a shared extent, check that all blocks
+ * between the previous record and this one have at most one reverse
+ * mapping.
+ */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+ xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+ rrc->next_unshared_rgbno = irec.rc_startblock +
+ irec.rc_blockcount;
+ }
+
+ return 0;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xchk_refcount_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *btree_oinfo,
+ xfs_extlen_t cow_blocks)
+{
+ xfs_filblks_t refcbt_blocks = 0;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many refcbt blocks as the rmap knows about. */
+ error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks);
+ if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error))
+ return;
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo,
+ &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != refcbt_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+ /* Check that we saw as many cow blocks as the rmap knows about. */
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (blocks != cow_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xchk_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_owner_info btree_oinfo;
+ struct xchk_rtrefcbt_records rrc = {
+ .cow_blocks = 0,
+ .next_unshared_rgbno = 0,
+ .prev_domain = XFS_REFC_DOMAIN_SHARED,
+ };
+ int error;
+
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino,
+ XFS_DATA_FORK);
+ error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec,
+ &btree_oinfo, &rrc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /*
+ * Check that all blocks between the last refcount > 1 record and the
+ * end of the rt volume have at most one reverse mapping.
+ */
+ xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks);
+
+ xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks);
+
+ return 0;
+}
+
+/* xref check that a cow staging extent is marked in the rtrefcountbt. */
+void
+xchk_xref_is_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_refcount_irec rc;
+ int has_refcount;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Find the CoW staging extent. */
+ error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+
+ /* Must be at least as long as what was passed in */
+ if (rc.rc_blockcount < len)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared. Only file data blocks
+ * can have multiple owners.
+ */
+void
+xchk_xref_is_not_rt_shared(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
new file mode 100644
index 000000000000..257cfb24beb4
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -0,0 +1,783 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+#include "scrub/rcbag.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * rt refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the rt refcount btree root and
+ * insert all the records.
+ */
+
+struct xrep_rtrefc {
+ /* refcount extents */
+ struct xfarray *refcount_records;
+
+ /* new refcountbt information */
+ struct xrep_newbt new_btree;
+
+ /* old refcountbt blocks */
+ struct xfsb_bitmap old_rtrefcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the rt refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_filblks_t btblocks;
+};
+
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_rtrefc_check_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *rec)
+{
+ xfs_rgblock_t last;
+
+ if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0)
+ return -EFSCORRUPTED;
+
+ last = rec->rc_startblock + rec->rc_blockcount - 1;
+ if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space or misaligned. */
+ return xrep_require_rtext_inuse(sc, rec->rc_startblock,
+ rec->rc_blockcount);
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_rtrefc_stash(
+ struct xrep_rtrefc *rr,
+ enum xfs_refc_domain domain,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len,
+ uint64_t refcount)
+{
+ struct xfs_refcount_irec irec = {
+ .rc_startblock = bno,
+ .rc_blockcount = len,
+ .rc_refcount = refcount,
+ .rc_domain = domain,
+ };
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
+
+ error = xrep_rtrefc_check_ext(rr->sc, &irec);
+ if (error)
+ return error;
+
+ trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec);
+
+ return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrefc_stash_cow(
+ struct xrep_rtrefc *rr,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_rtrefc_rmap_shareable(
+ const struct xfs_rmap_irec *rmap)
+{
+ /* rt metadata are never sharable */
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+
+ /* Unwritten file blocks are not shareable. */
+ if (rmap->rm_flags & XFS_RMAP_UNWRITTEN)
+ return false;
+
+ return true;
+}
+
+/* Grab the next (abbreviated) rmap record from the rmapbt. */
+STATIC int
+xrep_rtrefc_walk_rmaps(
+ struct xrep_rtrefc *rr,
+ struct xfs_rmap_irec *rmap,
+ bool *have_rec)
+{
+ struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents and the refcountbt blocks from the old tree for later
+ * disposal. We can only share written data fork extents, so
+ * keep looping until we find an rmap for one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
+ if (error)
+ return error;
+ } else if (xfs_is_sb_inum(mp, rmap->rm_owner) ||
+ (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+ XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ } while (!xrep_rtrefc_rmap_shareable(rmap));
+
+ *have_rec = true;
+ return 0;
+}
+
+static inline uint32_t
+xrep_rtrefc_encode_startblock(
+ const struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+/*
+ * Compare two refcount records. We want to sort in order of increasing block
+ * number.
+ */
+static int
+xrep_rtrefc_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_refcount_irec *ap = a;
+ const struct xfs_refcount_irec *bp = b;
+ uint32_t sa, sb;
+
+ sa = xrep_rtrefc_encode_startblock(ap);
+ sb = xrep_rtrefc_encode_startblock(bp);
+
+ if (sa > sb)
+ return 1;
+ if (sa < sb)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_rtrefc_sort_records(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_rgblock_t next_rgbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_rgbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_rgbno)
+ return -EFSCORRUPTED;
+
+ next_rgbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+/* Record extents that belong to the realtime refcount inode. */
+STATIC int
+xrep_rtrefc_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rr->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_check_ino_btree_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
+ rec->rm_blockcount);
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_rtrefc_push_rmaps_at(
+ struct xrep_rtrefc *rr,
+ struct rcbag *rcstack,
+ xfs_rgblock_t bno,
+ struct xfs_rmap_irec *rmap,
+ bool *have)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
+ if (error)
+ return error;
+
+ error = xrep_rtrefc_walk_rmaps(rr, rmap, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sr.rmap_cur);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Scan one AG for reverse mappings for the realtime refcount btree. */
+STATIC int
+xrep_rtrefc_scan_ag(
+ struct xrep_rtrefc *rr,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_rtrefc_find_refcounts(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct rcbag *rcstack;
+ struct xfs_perag *pag = NULL;
+ uint64_t old_stack_height;
+ xfs_rgblock_t sbno;
+ xfs_rgblock_t cbno;
+ xfs_rgblock_t nbno;
+ bool have;
+ int error;
+
+ /* Scan for old rtrefc btree blocks. */
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
+ error = xrep_rtrefc_scan_ag(rr, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+
+ /*
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If this exceeds
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
+ */
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
+ if (error)
+ goto out_cur;
+
+ /* Start the rtrmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sr.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sr.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_height = rcbag_count(rcstack);
+
+ /* While stack isn't empty... */
+ while (rcbag_count(rcstack) > 0) {
+ /* Pop all rmaps that end at nbno */
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
+ if (error)
+ goto out_bag;
+ }
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
+ error = xrep_rtrefc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_height);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (rcbag_count(rcstack) == 0)
+ break;
+ old_stack_height = rcbag_count(rcstack);
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(rcbag_count(rcstack) == 0);
+out_bag:
+ rcbag_free(&rcstack);
+out_cur:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ return error;
+}
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_rtrefc_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(rr->refcount_records, rr->array_cur++,
+ &cur->bc_rec.rc);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrefc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrefc_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level,
+ nr_this_level);
+}
+
+/*
+ * Use the collected refcount information to stage a new rt refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_rtrefc_build_new_tree(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_btree_cur *refc_cur;
+ int error;
+
+ error = xrep_rtrefc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the realtime refcount inode.
+ */
+ error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+ if (error)
+ return error;
+
+ rr->new_btree.bload.get_records = xrep_rtrefc_get_records;
+ rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block;
+ rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size;
+
+ refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg);
+ xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake);
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire
+ * rtrefcountbt from the number of extents we found, and pump up our
+ * transaction to have sufficient block reservation. We're allowed
+ * to exceed quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg),
+ rr->new_btree.bload.nr_blocks, 0, true);
+ if (error)
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /* Add all observed refcount records. */
+ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new rtrefc btree in the inode. After this point the old
+ * btree is no longer accessible, the new tree is live, and we can
+ * delete the cursor.
+ */
+ xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp);
+ xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_rtrefc_remove_old_tree(
+ struct xrep_rtrefc *rr)
+{
+ int error;
+
+ /*
+ * Free all the extents that were allocated to the former rtrefcountbt
+ * and aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc,
+ &rr->old_rtrefcountbt_blocks);
+ if (error)
+ return error;
+
+ /*
+ * Ensure the proper reservation for the rtrefcount inode so that we
+ * don't fail to expand the btree.
+ */
+ return xrep_reset_metafile_resv(rr->sc);
+}
+
+/* Rebuild the rt refcount btree. */
+int
+xrep_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrefc *rr;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rtrmapbt(mp))
+ return -EOPNOTSUPP;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+ rr->sc = sc;
+
+ /* Set up enough storage to handle one refcount record per rt extent. */
+ descr = xchk_xfile_ag_descr(sc, "reference count records");
+ error = xfarray_create(descr, mp->m_sb.sb_rextents,
+ sizeof(struct xfs_refcount_irec),
+ &rr->refcount_records);
+ kfree(descr);
+ if (error)
+ goto out_rr;
+
+ /* Collect all reference counts. */
+ xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks);
+ error = xrep_rtrefc_find_refcounts(rr);
+ if (error)
+ goto out_bitmap;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the refcount information. */
+ error = xrep_rtrefc_build_new_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_rtrefc_remove_old_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks);
+ xfarray_destroy(rr->refcount_records);
+out_rr:
+ kfree(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
new file mode 100644
index 000000000000..12989fe80e8b
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xchk_setup_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtrmapbt(sc);
+ if (error)
+ return error;
+ }
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+
+ error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg));
+ if (error)
+ return error;
+
+ return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+}
+
+/* Realtime reverse mapping. */
+
+struct xchk_rtrmap {
+ /*
+ * The furthest-reaching of the rmapbt records that we've already
+ * processed. This enables us to detect overlapping records for space
+ * allocations that cannot be shared.
+ */
+ struct xfs_rmap_irec overlap_rec;
+
+ /*
+ * The previous rmapbt record, so that we can check for two records
+ * that could be one.
+ */
+ struct xfs_rmap_irec prev_rec;
+};
+
+static inline bool
+xchk_rtrmapbt_is_shareable(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *irec)
+{
+ if (!xfs_has_rtreflink(sc->mp))
+ return false;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ return false;
+ return true;
+}
+
+/* Flag failures for records that overlap but cannot. */
+STATIC void
+xchk_rtrmapbt_check_overlapping(
+ struct xchk_btree *bs,
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ xfs_rtblock_t pnext, inext;
+
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ /* No previous record? */
+ if (cr->overlap_rec.rm_blockcount == 0)
+ goto set_prev;
+
+ /* Do overlap_rec and irec overlap? */
+ pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount;
+ if (pnext <= irec->rm_startblock)
+ goto set_prev;
+
+ /* Overlap is only allowed if both records are data fork mappings. */
+ if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+ !xchk_rtrmapbt_is_shareable(bs->sc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* Save whichever rmap record extends furthest. */
+ inext = irec->rm_startblock + irec->rm_blockcount;
+ if (pnext > inext)
+ return;
+
+set_prev:
+ memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Decide if two reverse-mapping records can be merged. */
+static inline bool
+xchk_rtrmap_mergeable(
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *r2)
+{
+ const struct xfs_rmap_irec *r1 = &cr->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+ if (cr->prev_rec.rm_blockcount == 0)
+ return false;
+
+ if (r1->rm_owner != r2->rm_owner)
+ return false;
+ if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
+ return false;
+ if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
+ XFS_RMAP_LEN_MAX)
+ return false;
+ if (r1->rm_flags != r2->rm_flags)
+ return false;
+ return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrmapbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rtrmap_mergeable(cr, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Cross-reference a rmap against the refcount btree. */
+STATIC void
+xchk_rtrmapbt_xref_rtrefc(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_rgblock_t fbno;
+ xfs_extlen_t flen;
+ bool is_inode;
+ bool is_bmbt;
+ bool is_attr;
+ bool is_unwritten;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+ is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+ /* If this is shared, must be a data fork extent. */
+ error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock,
+ irec->rm_blockcount, &fbno, &flen, false);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten))
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* Cross-reference with other metadata. */
+STATIC void
+xchk_rtrmapbt_xref(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc,
+ xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock),
+ irec->rm_blockcount);
+ if (irec->rm_owner == XFS_RMAP_OWN_COW)
+ xchk_xref_is_cow_staging(sc, irec->rm_startblock,
+ irec->rm_blockcount);
+ else
+ xchk_rtrmapbt_xref_rtrefc(sc, irec);
+}
+
+/* Scrub a realtime rmapbt record. */
+STATIC int
+xchk_rtrmapbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xchk_rtrmap *cr = bs->private;
+ struct xfs_rmap_irec irec;
+
+ if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
+ xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ xchk_rtrmapbt_check_mergeable(bs, cr, &irec);
+ xchk_rtrmapbt_check_overlapping(bs, cr, &irec);
+ xchk_rtrmapbt_xref(bs->sc, &irec);
+ return 0;
+}
+
+/* Scrub the realtime rmap btree. */
+int
+xchk_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = rtg_rmap(sc->sr.rtg);
+ struct xfs_owner_info oinfo;
+ struct xchk_rtrmap cr = { };
+ int error;
+
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK);
+ return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr);
+}
+
+/* xref check that the extent has no realtime reverse mapping at all */
+void
+xchk_xref_has_no_rt_owner(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is completely mapped */
+void
+xchk_xref_has_rt_owner(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_FULL)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is only owned by a given owner */
+void
+xchk_xref_is_only_rt_owned_by(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_rmap_matches res;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (res.matches != 1)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ if (res.bad_non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ if (res.non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
new file mode 100644
index 000000000000..f2fdd7a9fc24
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -0,0 +1,1006 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/rgb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Realtime Reverse Mapping Btree Repair
+ * =====================================
+ *
+ * This isn't quite as difficult as repairing the rmap btree on the data
+ * device, since we only store the data fork extents of realtime files on the
+ * realtime device. We still have to freeze the filesystem and stop the
+ * background threads like we do for the rmap repair, but we only have to scan
+ * realtime inodes.
+ *
+ * Collecting entries for the new realtime rmap btree is easy -- all we have
+ * to do is generate rtrmap entries from the data fork mappings of all realtime
+ * files in the filesystem. We then scan the rmap btrees of the data device
+ * looking for extents belonging to the old btree and note them in a bitmap.
+ *
+ * To rebuild the realtime rmap btree, we bulk-load the collected mappings into
+ * a new btree cursor and atomically swap that into the realtime inode. Then
+ * we can free the blocks from the old btree.
+ *
+ * We use the 'xrep_rtrmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rtrmap {
+ /* new rtrmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rtrmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* bitmap of old rtrmapbt blocks */
+ struct xfsb_bitmap old_rtrmapbt_blocks;
+
+ /* Hooks into rtrmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* in-memory btree cursor for the ->get_blocks walk */
+ struct xfs_btree_cur *mcur;
+
+ /* Number of records we're staging in the new btree. */
+ uint64_t nr_records;
+};
+
+/* Set us up to repair rt reverse mapping btrees. */
+int
+xrep_setup_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rtrmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ return xrep_require_rtext_inuse(sc, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rtrmap_stash(
+ struct xrep_rtrmap *rr,
+ xfs_rgblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rtrmap_found(sc->mp, &rmap);
+
+ /* Add entry to in-memory btree. */
+ mutex_lock(&rr->lock);
+ mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+/* Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rtrmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ struct xrep_rtrmap *rr;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rtrmap_stash_accumulated(
+ struct xrep_rtrmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rtrmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap_ifork *rf = priv;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ xfs_rgblock_t rgbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (xfs_rtb_to_rgno(mp, rec->br_startblock) !=
+ rtg_rgno(rf->rr->sc->sr.rtg))
+ return 0;
+
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock);
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ rgbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rtrmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = rgbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that maps to the rt volume. Sets @mappings_done to true if we've
+ * scanned the block mappings in this fork.
+ */
+STATIC int
+xrep_rtrmap_scan_bmbt(
+ struct xrep_rtrmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rtrmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ int error = 0;
+
+ *mappings_done = false;
+
+ /*
+ * If the incore extent cache is already loaded, we'll just use the
+ * incore extent scanner to record mappings. Don't bother walking the
+ * ondisk extent tree.
+ */
+ if (!xfs_need_iread_extents(ifp))
+ return 0;
+
+ /* Accumulate all the mappings in the bmap btree. */
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK);
+ error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ /* Stash any remaining accumulated rmaps and exit. */
+ *mappings_done = true;
+ return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rtrmap_scan_iext(
+ struct xrep_rtrmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/* Find all the extents on the realtime device mapped by an inode fork. */
+STATIC int
+xrep_rtrmap_scan_dfork(
+ struct xrep_rtrmap *rr,
+ struct xfs_inode *ip)
+{
+ struct xrep_rtrmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ int error = 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ bool mappings_done;
+
+ /*
+ * Scan the bmbt for mappings. If the incore extent tree is
+ * loaded, we want to scan the cached mappings since that's
+ * faster when the extent counts are very high.
+ */
+ error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ /* realtime data forks should only be extents or btree */
+ return -EFSCORRUPTED;
+ }
+
+ /* Scan incore extent cache. */
+ return xrep_rtrmap_scan_iext(&rf, ifp);
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rtrmap_scan_inode(
+ struct xrep_rtrmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Skip the rt rmap btree inode. */
+ if (rr->sc->ip == ip)
+ return 0;
+
+ lock_mode = xfs_ilock_data_map_shared(ip);
+
+ /* Check the data fork if it's on the realtime device. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ error = xrep_rtrmap_scan_dfork(rr, ip);
+ if (error)
+ goto out_unlock;
+ }
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Record extents that belong to the realtime rmap inode. */
+STATIC int
+xrep_rtrmap_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rr->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_check_ino_btree_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
+ rec->rm_blockcount);
+}
+
+/* Scan one AG for reverse mappings for the realtime rmap btree. */
+STATIC int
+xrep_rtrmap_scan_ag(
+ struct xrep_rtrmap *rr,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+struct xrep_rtrmap_stash_run {
+ struct xrep_rtrmap *rr;
+ uint64_t owner;
+};
+
+static int
+xrep_rtrmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rtrmap_stash_run *rsr = priv;
+ struct xrep_rtrmap *rr = rsr->rr;
+ xfs_rgblock_t rgbno = start;
+
+ return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rtrmap_stash_bitmap(
+ struct xrep_rtrmap *rr,
+ struct xrgb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rtrmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ };
+
+ return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrmap_walk_cowblocks(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec,
+ void *priv)
+{
+ struct xrgb_bitmap *bitmap = priv;
+
+ if (!xfs_refcount_check_domain(irec) ||
+ irec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ return xrgb_bitmap_set(bitmap, irec->rc_startblock,
+ irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rtrmap_find_refcount_rmaps(
+ struct xrep_rtrmap *rr)
+{
+ struct xrgb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_rtreflink(sc->mp))
+ return 0;
+
+ xrgb_bitmap_init(&cow_blocks);
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high,
+ xrep_rtrmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xrgb_bitmap_destroy(&cow_blocks);
+ return error;
+}
+
+/* Count and check all collected records. */
+STATIC int
+xrep_rtrmap_check_record(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ int error;
+
+ error = xrep_rtrmap_check_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ rr->nr_records++;
+ return 0;
+}
+
+/* Generate all the reverse-mappings for the realtime device. */
+STATIC int
+xrep_rtrmap_find_rmaps(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = NULL;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Generate rmaps for the realtime superblock */
+ if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) {
+ error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize,
+ XFS_RMAP_OWN_FS, 0, 0);
+ if (error)
+ return error;
+ }
+
+ /* Find CoW staging extents. */
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+ error = xrep_rtrmap_find_refcount_rmaps(rr);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the realtime metadata inodes and cancel the transaction to
+ * release the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ xchk_rtgroup_unlock(&sc->sr);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rtrmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the RT metadata in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /* Scan for old rtrmap blocks. */
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
+ error = xrep_rtrmap_scan_ag(rr, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned rt files in the filesystem. At the same time,
+ * check all our records before we start building a new btree, which
+ * requires the rtbitmap lock.
+ */
+ mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree);
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr);
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Building the new rtrmap btree. */
+
+/* Retrieve rtrmapbt data for bulk load. */
+STATIC int
+xrep_rtrmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ int stat = 0;
+
+ error = xfs_btree_increment(rr->mcur, 0, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrmap_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
+}
+
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rtrmap_build_new_tree(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_btree_cur *rmap_cur;
+ int error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the realtime rmapbt inode.
+ */
+ error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+ if (error)
+ return error;
+
+ rr->new_btree.bload.get_records = xrep_rtrmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block;
+ rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size;
+
+ rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake);
+
+ /* Compute how many blocks we'll need for the rmaps collected. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire
+ * rtrmapbt from the number of extents we found, and pump up our
+ * transaction to have sufficient block reservation. We're allowed
+ * to exceed quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg),
+ rr->new_btree.bload.nr_blocks, 0, true);
+ if (error)
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Create a cursor to the in-memory btree so that we can bulk load the
+ * new btree.
+ */
+ rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree);
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_mcur;
+
+ /* Add all observed rmap records. */
+ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Install the new rtrmap btree in the inode. After this point the old
+ * btree is no longer accessible, the new tree is live, and we can
+ * delete the cursor.
+ */
+ xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp);
+ xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Reaping the old btree. */
+
+/* Reap the old rtrmapbt blocks. */
+STATIC int
+xrep_rtrmap_remove_old_tree(
+ struct xrep_rtrmap *rr)
+{
+ int error;
+
+ /*
+ * Free all the extents that were allocated to the former rtrmapbt and
+ * aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
+ if (error)
+ return error;
+
+ /*
+ * Ensure the proper reservation for the rtrmap inode so that we don't
+ * fail to expand the new btree.
+ */
+ return xrep_reset_metafile_resv(rr->sc);
+}
+
+static inline bool
+xrep_rtrmapbt_want_live_update(
+ struct xchk_iscan *iscan,
+ const struct xfs_owner_info *oi)
+{
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ /*
+ * We scanned the CoW staging extents before we started the iscan, so
+ * we need all the updates.
+ */
+ if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+ return true;
+
+ /* Ignore updates to files that the scanner hasn't visited yet. */
+ return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rtrmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the rtrmap ILOCK and is generating
+ * the update, so we must be careful about which parts of the struct
+ * xrep_rtrmap that we change.
+ */
+static int
+xrep_rtrmapbt_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_rmap_update_params *p = data;
+ struct xrep_rtrmap *rr;
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *mcur;
+ struct xfs_trans *tp;
+ void *txcookie;
+ int error;
+
+ rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
+ mp = rr->sc->mp;
+
+ if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo))
+ goto out_unlock;
+
+ trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);
+
+ error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+ if (error)
+ goto out_abort;
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
+ error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+ p->blockcount, &p->oinfo, p->unwritten);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rtrmap_btree, tp);
+ if (error)
+ goto out_cancel;
+
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+ return NOTIFY_DONE;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+out_unlock:
+ return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rtrmap_setup_scan(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+ xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);
+
+ /* Set up some storage */
+ error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp,
+ rtg_rgno(sc->sr.rtg));
+ if (error)
+ goto out_bitmap;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rtrmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * rtrmap ILOCK to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update);
+ error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rtrmap_btree);
+out_bitmap:
+ xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rtrmap_teardown(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rtrmap_btree);
+ xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+ mutex_destroy(&rr->lock);
+}
+
+/* Repair the realtime rmap btree. */
+int
+xrep_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrmap *rr = sc->buf;
+ int error;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ error = xrep_rtrmap_setup_scan(rr);
+ if (error)
+ return error;
+
+ /* Collect rmaps for realtime files. */
+ error = xrep_rtrmap_find_rmaps(rr);
+ if (error)
+ goto out_records;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the rtrmap information. */
+ error = xrep_rtrmap_build_new_tree(rr);
+ if (error)
+ goto out_records;
+
+ /* Kill the old tree. */
+ error = xrep_rtrmap_remove_old_tree(rr);
+ if (error)
+ goto out_records;
+
+out_records:
+ xrep_rtrmap_teardown(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 49fc6250bafc..4ac679c1bd29 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -81,8 +81,7 @@ xchk_setup_rtsummary(
if (error)
return error;
- error = xchk_install_live_inode(sc,
- sc->sr.rtg->rtg_inodes[XFS_RTGI_SUMMARY]);
+ error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg));
if (error)
return error;
@@ -90,6 +89,10 @@ xchk_setup_rtsummary(
if (error)
return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP);
+ if (error)
+ return error;
+
/*
* Now that we've locked the rtbitmap and rtsummary, we can't race with
* growfsrt trying to expand the summary or change the size of the rt
@@ -100,7 +103,6 @@ xchk_setup_rtsummary(
* exclusively here. If we ever start caring about running concurrent
* fsmap with scrub this could be changed.
*/
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
@@ -191,8 +193,7 @@ xchk_rtsum_record_free(
rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount);
if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
- xchk_ino_xref_set_corrupt(sc,
- rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
return -EFSCORRUPTED;
}
@@ -218,7 +219,7 @@ xchk_rtsum_compute(
/* If the bitmap size doesn't match the computed size, bail. */
if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) !=
- rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_disk_size)
+ rtg_bitmap(rtg)->i_disk_size)
return -EFSCORRUPTED;
return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc);
@@ -310,8 +311,8 @@ xchk_rtsummary(
{
struct xfs_mount *mp = sc->mp;
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
- struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
+ struct xfs_inode *rsumip = rtg_summary(rtg);
struct xchk_rtsummary *rts = sc->buf;
int error;
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index 8198ea84ad70..d593977d70df 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -165,7 +165,8 @@ xrep_rtsummary(
* Now exchange the contents. Nothing in repair uses the temporary
* buffer, so we can reuse it for the tempfile exchrange information.
*/
- error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch);
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0,
+ rts->rsumblocks, &rts->tempexch);
if (error)
return error;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 950f5a58dcd9..6fa9e3e5bab7 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -149,6 +149,18 @@ xchk_probe(
if (xchk_should_terminate(sc, &error))
return error;
+ /*
+ * If the caller is probing to see if repair works but repair isn't
+ * built into the kernel, return EOPNOTSUPP because that's the signal
+ * that userspace expects. If online repair is built in, set the
+ * CORRUPT flag (without any of the usual tracing/logging) to force us
+ * into xrep_probe.
+ */
+ if (xchk_could_repair(sc)) {
+ if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR))
+ return -EOPNOTSUPP;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ }
return 0;
}
@@ -164,7 +176,7 @@ xchk_fsgates_disable(
trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
if (sc->flags & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_disable();
+ xfs_defer_drain_wait_disable();
if (sc->flags & XCHK_FSGATES_QUOTA)
xfs_dqtrx_hook_disable();
@@ -218,6 +230,8 @@ xchk_teardown(
int error)
{
xchk_ag_free(sc, &sc->sa);
+ xchk_rtgroup_btcur_free(&sc->sr);
+
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
error = xfs_trans_commit(sc->tp);
@@ -458,6 +472,20 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.has = xfs_has_rtsb,
.repair = xrep_rgsuperblock,
},
+ [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rtrmapbt,
+ .scrub = xchk_rtrmapbt,
+ .has = xfs_has_rtrmapbt,
+ .repair = xrep_rtrmapbt,
+ },
+ [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rtrefcountbt,
+ .scrub = xchk_rtrefcountbt,
+ .has = xfs_has_rtreflink,
+ .repair = xrep_rtrefcountbt,
+ },
};
static int
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 5dbbe93cb49b..a3f1abc91390 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -96,7 +96,7 @@ struct xchk_meta_ops {
int (*repair_eval)(struct xfs_scrub *sc);
/* Decide if we even have this piece of metadata. */
- bool (*has)(struct xfs_mount *);
+ bool (*has)(const struct xfs_mount *);
/* type describing required/allowed inputs */
enum xchk_type type;
@@ -126,6 +126,10 @@ struct xchk_rt {
/* XFS_RTGLOCK_* lock state if locked */
unsigned int rtlock_flags;
+
+ /* rtgroup btrees */
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
};
struct xfs_scrub {
@@ -280,10 +284,14 @@ int xchk_metapath(struct xfs_scrub *sc);
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
int xchk_rgsuperblock(struct xfs_scrub *sc);
+int xchk_rtrmapbt(struct xfs_scrub *sc);
+int xchk_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xchk_rtbitmap xchk_nothing
# define xchk_rtsummary xchk_nothing
# define xchk_rgsuperblock xchk_nothing
+# define xchk_rtrmapbt xchk_nothing
+# define xchk_rtrefcountbt xchk_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
@@ -317,8 +325,26 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
#ifdef CONFIG_XFS_RT
void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
xfs_extlen_t len);
+void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
#else
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0)
+# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0)
#endif
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index a476c7b2ab75..f8a37ea97791 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -82,6 +82,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_DIRTREE] = "dirtree",
[XFS_SCRUB_TYPE_METAPATH] = "metapath",
[XFS_SCRUB_TYPE_RGSUPER] = "rgsuper",
+ [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt",
+ [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h
index 995ba187c5aa..eccda720c2ca 100644
--- a/fs/xfs/scrub/tempexch.h
+++ b/fs/xfs/scrub/tempexch.h
@@ -12,7 +12,7 @@ struct xrep_tempexch {
};
int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork,
- struct xrep_tempexch *ti);
+ xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti);
int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork,
struct xrep_tempexch *ti);
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 2d7ca7e1bbca..cf99e0ca51b0 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -606,6 +606,8 @@ STATIC int
xrep_tempexch_prep_request(
struct xfs_scrub *sc,
int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
struct xrep_tempexch *tx)
{
struct xfs_exchmaps_req *req = &tx->req;
@@ -629,18 +631,19 @@ xrep_tempexch_prep_request(
/* Exchange all mappings in both forks. */
req->ip1 = sc->tempip;
req->ip2 = sc->ip;
- req->startoff1 = 0;
- req->startoff2 = 0;
+ req->startoff1 = off;
+ req->startoff2 = off;
switch (whichfork) {
case XFS_ATTR_FORK:
req->flags |= XFS_EXCHMAPS_ATTR_FORK;
break;
case XFS_DATA_FORK:
- /* Always exchange sizes when exchanging data fork mappings. */
- req->flags |= XFS_EXCHMAPS_SET_SIZES;
+ /* Exchange sizes when exchanging all data fork mappings. */
+ if (off == 0 && len == XFS_MAX_FILEOFF)
+ req->flags |= XFS_EXCHMAPS_SET_SIZES;
break;
}
- req->blockcount = XFS_MAX_FILEOFF;
+ req->blockcount = len;
return 0;
}
@@ -749,6 +752,7 @@ xrep_tempexch_reserve_quota(
* or the two inodes have the same dquots.
*/
if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ xfs_is_metadir_inode(req->ip1) ||
(req->ip1->i_udquot == req->ip2->i_udquot &&
req->ip1->i_gdquot == req->ip2->i_gdquot &&
req->ip1->i_pdquot == req->ip2->i_pdquot))
@@ -795,6 +799,8 @@ int
xrep_tempexch_trans_reserve(
struct xfs_scrub *sc,
int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
struct xrep_tempexch *tx)
{
int error;
@@ -803,7 +809,7 @@ xrep_tempexch_trans_reserve(
xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
- error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx);
if (error)
return error;
@@ -841,7 +847,8 @@ xrep_tempexch_trans_alloc(
ASSERT(sc->tp == NULL);
ASSERT(xfs_has_exchange_range(sc->mp));
- error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF,
+ tx);
if (error)
return error;
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 98f923ae664d..2450e214103f 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_parent.h"
#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index d2ae7e93acb0..d7c4ced47c15 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -17,6 +17,7 @@
#include "xfs_bit.h"
#include "xfs_quota_defs.h"
+struct xfs_rtgroup;
struct xfs_scrub;
struct xfile;
struct xfarray;
@@ -40,6 +41,9 @@ struct xchk_dirtree_outcomes;
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+TRACE_DEFINE_ENUM(XG_TYPE_AG);
+TRACE_DEFINE_ENUM(XG_TYPE_RTG);
+
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
@@ -72,6 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -105,7 +111,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
{ XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \
{ XFS_SCRUB_TYPE_BARRIER, "barrier" }, \
{ XFS_SCRUB_TYPE_METAPATH, "metapath" }, \
- { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }
+ { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \
+ { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \
+ { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -1956,32 +1964,36 @@ DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup);
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len),
- TP_ARGS(pag, agbno, len),
+ TP_ARGS(xg, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len)
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
xfs_extlen_t len), \
- TP_ARGS(pag, agbno, len))
+ TP_ARGS(xg, agbno, len))
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
@@ -1989,35 +2001,39 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),
- TP_ARGS(pag, agbno, len, crosslinked),
+ TP_ARGS(xg, agbno, len, crosslinked),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
__field(bool, crosslinked)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->crosslinked = crosslinked;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len,
__entry->crosslinked ? 1 : 0)
);
#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
DEFINE_EVENT(xrep_reap_find_class, name, \
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
xfs_extlen_t len, bool crosslinked), \
- TP_ARGS(pag, agbno, len, crosslinked))
+ TP_ARGS(xg, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
@@ -2108,29 +2124,33 @@ TRACE_EVENT(xrep_ibt_found,
)
TRACE_EVENT(xrep_refc_found,
- TP_PROTO(const struct xfs_perag *pag,
+ TP_PROTO(const struct xfs_group *xg,
const struct xfs_refcount_irec *rec),
- TP_ARGS(pag, rec),
+ TP_ARGS(xg, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
+ __field(enum xfs_group_type, type)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->agno = xg->xg_gno;
+ __entry->type = xg->xg_type;
__entry->domain = rec->rc_domain;
__entry->startblock = rec->rc_startblock;
__entry->blockcount = rec->rc_blockcount;
__entry->refcount = rec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
@@ -2282,6 +2302,32 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->rmapbt_sz,
__entry->refcbt_sz)
)
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize,
+ TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz),
+ TP_ARGS(mp, rgno, usedlen, rmapbt_sz),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, usedlen)
+ __field(xfs_rgblock_t, rmapbt_sz)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgno = rgno;
+ __entry->usedlen = usedlen;
+ __entry->rmapbt_sz = rmapbt_sz;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->usedlen,
+ __entry->rmapbt_sz)
+);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xrep_reset_counters,
TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
TP_ARGS(mp, fsc),
@@ -2680,11 +2726,12 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
TRACE_EVENT(xrep_rmap_live_update,
- TP_PROTO(const struct xfs_perag *pag, unsigned int op,
+ TP_PROTO(const struct xfs_group *xg, unsigned int op,
const struct xfs_rmap_update_params *p),
- TP_ARGS(pag, op, p),
+ TP_ARGS(xg, op, p),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(unsigned int, op)
__field(xfs_agblock_t, agbno)
@@ -2694,8 +2741,9 @@ TRACE_EVENT(xrep_rmap_live_update,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->op = op;
__entry->agbno = p->startblock;
__entry->len = p->blockcount;
@@ -2704,10 +2752,12 @@ TRACE_EVENT(xrep_rmap_live_update,
if (p->unwritten)
__entry->flags |= XFS_RMAP_UNWRITTEN;
),
- TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__entry->op,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len,
__entry->owner,
@@ -3605,6 +3655,186 @@ DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink);
DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink);
DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xrep_rtbitmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end),
+ TP_ARGS(mp, start, end),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rtxnum_t, start)
+ __field(xfs_rtxnum_t, end)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->start = start;
+ __entry->end = end;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->start,
+ __entry->end)
+);
+#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \
+DEFINE_EVENT(xrep_rtbitmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \
+ xfs_rtxnum_t end), \
+ TP_ARGS(mp, start, end))
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free);
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk);
+
+TRACE_EVENT(xrep_rtbitmap_or,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+ xfs_rtword_t mask, xfs_rtword_t word),
+ TP_ARGS(mp, wordoff, mask, word),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, mask)
+ __field(unsigned int, word)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->wordoff = wordoff;
+ __entry->mask = mask;
+ __entry->word = word;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->wordoff,
+ __entry->mask,
+ __entry->word)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff,
+ xfs_rtxnum_t rtx, xfs_rtxnum_t len),
+ TP_ARGS(rtg, rbmoff, rtx, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_fileoff_t, rbmoff)
+ __field(xfs_rtxnum_t, rtx)
+ __field(xfs_rtxnum_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->rbmoff = rbmoff;
+ __entry->rtx = rtx;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rgno,
+ __entry->rbmoff,
+ __entry->rtx,
+ __entry->len)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load_words,
+ TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff,
+ unsigned long long wordoff, unsigned int wordcnt),
+ TP_ARGS(mp, rbmoff, wordoff, wordcnt),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_fileoff_t, rbmoff)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, wordcnt)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->rbmoff = rbmoff;
+ __entry->wordoff = wordoff;
+ __entry->wordcnt = wordcnt;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rbmoff,
+ __entry->wordoff,
+ __entry->wordcnt)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load_word,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+ unsigned int bit, xfs_rtword_t ondisk_word,
+ xfs_rtword_t xfile_word, xfs_rtword_t word_mask),
+ TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, bit)
+ __field(xfs_rtword_t, ondisk_word)
+ __field(xfs_rtword_t, xfile_word)
+ __field(xfs_rtword_t, word_mask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->wordoff = wordoff;
+ __entry->bit = bit;
+ __entry->ondisk_word = ondisk_word;
+ __entry->xfile_word = xfile_word;
+ __entry->word_mask = word_mask;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->wordoff,
+ __entry->bit,
+ __entry->ondisk_word,
+ __entry->ondisk_word & __entry->word_mask,
+ __entry->xfile_word,
+ __entry->xfile_word & ~__entry->word_mask,
+ (__entry->xfile_word & ~__entry->word_mask) |
+ (__entry->ondisk_word & __entry->word_mask),
+ __entry->word_mask)
+);
+
+TRACE_EVENT(xrep_rtrmap_found,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec),
+ TP_ARGS(mp, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->rgbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rgbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#endif /* CONFIG_XFS_RT */
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 559a3a577097..5077d52a775d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -19,6 +19,7 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -114,7 +115,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
@@ -125,13 +126,13 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
- error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+ error = xfs_setfilesize(ip, offset, size);
done:
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
@@ -395,10 +396,11 @@ allocate_blocks:
}
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
+xfs_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
int status)
{
+ struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -409,7 +411,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -417,10 +419,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ if (xfs_ioend_is_append(ioend) ||
+ (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ if (status)
+ return status;
+ submit_bio(&ioend->io_bio);
+ return 0;
}
/*
@@ -462,7 +468,7 @@ xfs_discard_folio(
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
+ .submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};
@@ -528,12 +534,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, hence triggers the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
@@ -549,11 +587,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 24fb12986a56..319004bf089f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -305,11 +305,6 @@ xfs_attr3_root_inactive(
XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp);
if (error)
return error;
- error = bp->b_error;
- if (error) {
- xfs_trans_brelse(*trans, bp);
- return error;
- }
xfs_trans_binval(*trans, bp); /* remove from cache */
/*
* Commit the invalidate and start the next transaction.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa63b8efd782..5d560e9073f4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -22,17 +22,13 @@
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_buf_mem.h"
+#include "xfs_notify_failure.h"
struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
*
- * xfs_buf_ioacct_inc:
- * xfs_buf_ioacct_dec:
- * b_sema (caller holds)
- * b_lock
- *
* xfs_buf_stale:
* b_sema (caller holds)
* b_lock
@@ -40,8 +36,7 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_rele:
* b_lock
- * pag_buf_lock
- * lru_lock
+ * lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
@@ -52,14 +47,8 @@ struct kmem_cache *xfs_buf_cache;
* b_lock (trylock due to inversion)
*/
-static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
-
-static inline int
-xfs_buf_submit(
- struct xfs_buf *bp)
-{
- return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
-}
+static void xfs_buf_submit(struct xfs_buf *bp);
+static int xfs_buf_iowait(struct xfs_buf *bp);
static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
{
@@ -88,60 +77,6 @@ xfs_buf_vmap_len(
}
/*
- * Bump the I/O in flight count on the buftarg if we haven't yet done so for
- * this buffer. The count is incremented once per buffer (per hold cycle)
- * because the corresponding decrement is deferred to buffer release. Buffers
- * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
- * tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
- * in-flight buffers.
- *
- * Buffers that are never released (e.g., superblock, iclog buffers) must set
- * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
- * never reaches zero and unmount hangs indefinitely.
- */
-static inline void
-xfs_buf_ioacct_inc(
- struct xfs_buf *bp)
-{
- if (bp->b_flags & XBF_NO_IOACCT)
- return;
-
- ASSERT(bp->b_flags & XBF_ASYNC);
- spin_lock(&bp->b_lock);
- if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
- bp->b_state |= XFS_BSTATE_IN_FLIGHT;
- percpu_counter_inc(&bp->b_target->bt_io_count);
- }
- spin_unlock(&bp->b_lock);
-}
-
-/*
- * Clear the in-flight state on a buffer about to be released to the LRU or
- * freed and unaccount from the buftarg.
- */
-static inline void
-__xfs_buf_ioacct_dec(
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&bp->b_lock);
-
- if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
- bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
- percpu_counter_dec(&bp->b_target->bt_io_count);
- }
-}
-
-static inline void
-xfs_buf_ioacct_dec(
- struct xfs_buf *bp)
-{
- spin_lock(&bp->b_lock);
- __xfs_buf_ioacct_dec(bp);
- spin_unlock(&bp->b_lock);
-}
-
-/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -164,21 +99,13 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- /*
- * Once the buffer is marked stale and unlocked, a subsequent lookup
- * could reset b_flags. There is no guarantee that the buffer is
- * unaccounted (released to LRU) before that occurs. Drop in-flight
- * status now to preserve accounting consistency.
- */
spin_lock(&bp->b_lock);
- __xfs_buf_ioacct_dec(bp);
-
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- atomic_dec(&bp->b_hold);
+ bp->b_hold--;
- ASSERT(atomic_read(&bp->b_hold) >= 1);
+ ASSERT(bp->b_hold >= 1);
spin_unlock(&bp->b_lock);
}
@@ -202,9 +129,6 @@ xfs_buf_get_maps(
return 0;
}
-/*
- * Frees b_pages if it was allocated.
- */
static void
xfs_buf_free_maps(
struct xfs_buf *bp)
@@ -237,23 +161,25 @@ _xfs_buf_alloc(
*/
flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
- atomic_set(&bp->b_hold, 1);
+ /*
+ * A new buffer is held and locked by the owner. This ensures that the
+ * buffer is owned by the caller and racing RCU lookups right after
+ * inserting into the hash table are safe (and will have to wait for
+ * the unlock to do anything non-trivial).
+ */
+ bp->b_hold = 1;
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+
+ spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
INIT_LIST_HEAD(&bp->b_li_list);
- sema_init(&bp->b_sema, 0); /* held, no waiters */
- spin_lock_init(&bp->b_lock);
bp->b_target = target;
bp->b_mount = target->bt_mount;
bp->b_flags = flags;
- /*
- * Set length and io_length to the same value initially.
- * I/O routines should use io_length, which will be the same in
- * most cases but may be reset (e.g. XFS recovery).
- */
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
kmem_cache_free(xfs_buf_cache, bp);
@@ -395,8 +321,8 @@ xfs_buf_alloc_pages(
for (;;) {
long last = filled;
- filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
- bp->b_pages);
+ filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
+ bp->b_pages);
if (filled == bp->b_page_count) {
XFS_STATS_INC(bp->b_mount, xb_page_found);
break;
@@ -519,7 +445,6 @@ int
xfs_buf_cache_init(
struct xfs_buf_cache *bch)
{
- spin_lock_init(&bch->bc_lock);
return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
@@ -588,6 +513,20 @@ xfs_buf_find_lock(
return 0;
}
+static bool
+xfs_buf_try_hold(
+ struct xfs_buf *bp)
+{
+ spin_lock(&bp->b_lock);
+ if (bp->b_hold == 0) {
+ spin_unlock(&bp->b_lock);
+ return false;
+ }
+ bp->b_hold++;
+ spin_unlock(&bp->b_lock);
+ return true;
+}
+
static inline int
xfs_buf_lookup(
struct xfs_buf_cache *bch,
@@ -600,7 +539,7 @@ xfs_buf_lookup(
rcu_read_lock();
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
- if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ if (!bp || !xfs_buf_try_hold(bp)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -655,18 +594,20 @@ xfs_buf_find_insert(
if (error)
goto out_free_buf;
- spin_lock(&bch->bc_lock);
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+
+ rcu_read_lock();
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
+ rcu_read_unlock();
error = PTR_ERR(bp);
- spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
- if (bp) {
+ if (bp && xfs_buf_try_hold(bp)) {
/* found an existing buffer */
- atomic_inc(&bp->b_hold);
- spin_unlock(&bch->bc_lock);
+ rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -674,10 +615,8 @@ xfs_buf_find_insert(
*bpp = bp;
goto out_free_buf;
}
+ rcu_read_unlock();
- /* The new buffer keeps the perag reference until it is freed. */
- new_bp->b_pag = pag;
- spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
@@ -797,16 +736,14 @@ out_put_perag:
int
_xfs_buf_read(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
+ struct xfs_buf *bp)
{
- ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
- bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
-
- return xfs_buf_submit(bp);
+ bp->b_flags |= XBF_READ;
+ xfs_buf_submit(bp);
+ return xfs_buf_iowait(bp);
}
/*
@@ -857,6 +794,8 @@ xfs_buf_read_map(
struct xfs_buf *bp;
int error;
+ ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD)));
+
flags |= XBF_READ;
*bpp = NULL;
@@ -870,21 +809,11 @@ xfs_buf_read_map(
/* Initiate the buffer read and wait. */
XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
- error = _xfs_buf_read(bp, flags);
-
- /* Readahead iodone already dropped the buffer, so exit. */
- if (flags & XBF_ASYNC)
- return 0;
+ error = _xfs_buf_read(bp);
} else {
/* Buffer already read; all we need to do is check it. */
error = xfs_buf_reverify(bp, ops);
- /* Readahead already finished; drop the buffer and exit. */
- if (flags & XBF_ASYNC) {
- xfs_buf_relse(bp);
- return 0;
- }
-
/* We do not want read in the flags */
bp->b_flags &= ~XBF_READ;
ASSERT(bp->b_ops != NULL || ops == NULL);
@@ -936,6 +865,7 @@ xfs_buf_readahead_map(
int nmaps,
const struct xfs_buf_ops *ops)
{
+ const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD;
struct xfs_buf *bp;
/*
@@ -945,9 +875,21 @@ xfs_buf_readahead_map(
if (xfs_buftarg_is_mem(target))
return;
- xfs_buf_read_map(target, map, nmaps,
- XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
- __this_address);
+ if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp))
+ return;
+ trace_xfs_buf_readahead(bp, 0, _RET_IP_);
+
+ if (bp->b_flags & XBF_DONE) {
+ xfs_buf_reverify(bp, ops);
+ xfs_buf_relse(bp);
+ return;
+ }
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
+ bp->b_flags |= flags;
+ percpu_counter_inc(&target->bt_readahead_count);
+ xfs_buf_submit(bp);
}
/*
@@ -982,8 +924,8 @@ xfs_buf_read_uncached(
bp->b_ops = ops;
xfs_buf_submit(bp);
- if (bp->b_error) {
- error = bp->b_error;
+ error = xfs_buf_iowait(bp);
+ if (error) {
xfs_buf_relse(bp);
return error;
}
@@ -1003,10 +945,12 @@ xfs_buf_get_uncached(
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+ /* there are currently no valid flags for xfs_buf_get_uncached */
+ ASSERT(flags == 0);
+
*bpp = NULL;
- /* flags might contain irrelevant bits, pass only what we care about */
- error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+ error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
if (error)
return error;
@@ -1043,7 +987,10 @@ xfs_buf_hold(
struct xfs_buf *bp)
{
trace_xfs_buf_hold(bp, _RET_IP_);
- atomic_inc(&bp->b_hold);
+
+ spin_lock(&bp->b_lock);
+ bp->b_hold++;
+ spin_unlock(&bp->b_lock);
}
static void
@@ -1051,10 +998,14 @@ xfs_buf_rele_uncached(
struct xfs_buf *bp)
{
ASSERT(list_empty(&bp->b_lru));
- if (atomic_dec_and_test(&bp->b_hold)) {
- xfs_buf_ioacct_dec(bp);
- xfs_buf_free(bp);
+
+ spin_lock(&bp->b_lock);
+ if (--bp->b_hold) {
+ spin_unlock(&bp->b_lock);
+ return;
}
+ spin_unlock(&bp->b_lock);
+ xfs_buf_free(bp);
}
static void
@@ -1064,51 +1015,30 @@ xfs_buf_rele_cached(
struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
- ASSERT(atomic_read(&bp->b_hold) > 0);
-
- /*
- * We grab the b_lock here first to serialise racing xfs_buf_rele()
- * calls. The pag_buf_lock being taken on the last reference only
- * serialises against racing lookups in xfs_buf_find(). IOWs, the second
- * to last reference we drop here is not serialised against the last
- * reference until we take bp->b_lock. Hence if we don't grab b_lock
- * first, the last "release" reference can win the race to the lock and
- * free the buffer before the second-to-last reference is processed,
- * leading to a use-after-free scenario.
- */
spin_lock(&bp->b_lock);
- release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
- if (!release) {
- /*
- * Drop the in-flight state if the buffer is already on the LRU
- * and it holds the only reference. This is racy because we
- * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
- * ensures the decrement occurs only once per-buf.
- */
- if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
- __xfs_buf_ioacct_dec(bp);
+ ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold > 1) {
+ bp->b_hold--;
goto out_unlock;
}
- /* the last reference has been dropped ... */
- __xfs_buf_ioacct_dec(bp);
- if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /* we are asked to drop the last reference */
+ if (atomic_read(&bp->b_lru_ref)) {
/*
- * If the buffer is added to the LRU take a new reference to the
+ * If the buffer is added to the LRU, keep the reference to the
* buffer for the LRU and clear the (now stale) dispose list
- * state flag
+ * state flag, else drop the reference.
*/
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
bp->b_state &= ~XFS_BSTATE_DISPOSE;
- atomic_inc(&bp->b_hold);
- }
- spin_unlock(&bch->bc_lock);
+ else
+ bp->b_hold--;
} else {
+ bp->b_hold--;
/*
* most of the time buffers will already be removed from the
* LRU, so optimise that case by checking for the
@@ -1124,7 +1054,6 @@ xfs_buf_rele_cached(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- spin_unlock(&bch->bc_lock);
if (pag)
xfs_perag_put(pag);
freebuf = true;
@@ -1291,6 +1220,7 @@ xfs_buf_ioend_handle_error(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_error_cfg *cfg;
+ struct xfs_log_item *lip;
/*
* If we've already shutdown the journal because of I/O errors, there's
@@ -1338,12 +1268,11 @@ xfs_buf_ioend_handle_error(
}
/* Still considered a transient error. Caller will schedule retries. */
- if (bp->b_flags & _XBF_INODES)
- xfs_buf_inode_io_fail(bp);
- else if (bp->b_flags & _XBF_DQUOTS)
- xfs_buf_dquot_io_fail(bp);
- else
- ASSERT(list_empty(&bp->b_li_list));
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
+ }
+
xfs_buf_ioerror(bp, 0);
xfs_buf_relse(bp);
return true;
@@ -1351,6 +1280,7 @@ xfs_buf_ioend_handle_error(
resubmit:
xfs_buf_ioerror(bp, 0);
bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ reinit_completion(&bp->b_iowait);
xfs_buf_submit(bp);
return true;
out_stale:
@@ -1361,24 +1291,23 @@ out_stale:
return false;
}
-static void
-xfs_buf_ioend(
+/* returns false if the caller needs to resubmit the I/O, else true */
+static bool
+__xfs_buf_ioend(
struct xfs_buf *bp)
{
trace_xfs_buf_iodone(bp, _RET_IP_);
- /*
- * Pull in IO completion errors now. We are guaranteed to be running
- * single threaded, so we don't need the lock to read b_io_error.
- */
- if (!bp->b_error && bp->b_io_error)
- xfs_buf_ioerror(bp, bp->b_io_error);
-
if (bp->b_flags & XBF_READ) {
+ if (!bp->b_error && xfs_buf_is_vmapped(bp))
+ invalidate_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
bp->b_flags |= XBF_DONE;
+ if (bp->b_flags & XBF_READ_AHEAD)
+ percpu_counter_dec(&bp->b_target->bt_readahead_count);
} else {
if (!bp->b_error) {
bp->b_flags &= ~XBF_WRITE_FAIL;
@@ -1386,7 +1315,7 @@ xfs_buf_ioend(
}
if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
- return;
+ return false;
/* clear the retry state */
bp->b_last_error = 0;
@@ -1401,16 +1330,21 @@ xfs_buf_ioend(
if (bp->b_log_item)
xfs_buf_item_done(bp);
- if (bp->b_flags & _XBF_INODES)
- xfs_buf_inode_iodone(bp);
- else if (bp->b_flags & _XBF_DQUOTS)
- xfs_buf_dquot_iodone(bp);
-
+ if (bp->b_iodone)
+ bp->b_iodone(bp);
}
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
_XBF_LOGRECOVERY);
+ return true;
+}
+static void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
+{
+ if (!__xfs_buf_ioend(bp))
+ return;
if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else
@@ -1424,15 +1358,8 @@ xfs_buf_ioend_work(
struct xfs_buf *bp =
container_of(work, struct xfs_buf, b_ioend_work);
- xfs_buf_ioend(bp);
-}
-
-static void
-xfs_buf_ioend_async(
- struct xfs_buf *bp)
-{
- INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
- queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+ if (__xfs_buf_ioend(bp))
+ xfs_buf_relse(bp);
}
void
@@ -1485,7 +1412,8 @@ xfs_bwrite(
bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
XBF_DONE);
- error = xfs_buf_submit(bp);
+ xfs_buf_submit(bp);
+ error = xfs_buf_iowait(bp);
if (error)
xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
return error;
@@ -1495,188 +1423,85 @@ static void
xfs_buf_bio_end_io(
struct bio *bio)
{
- struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
-
- if (!bio->bi_status &&
- (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
- XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
- bio->bi_status = BLK_STS_IOERR;
-
- /*
- * don't overwrite existing errors - otherwise we can lose errors on
- * buffers that require multiple bios to complete.
- */
- if (bio->bi_status) {
- int error = blk_status_to_errno(bio->bi_status);
-
- cmpxchg(&bp->b_io_error, 0, error);
- }
-
- if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
- invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
-
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
- xfs_buf_ioend_async(bp);
- bio_put(bio);
-}
-
-static void
-xfs_buf_ioapply_map(
- struct xfs_buf *bp,
- int map,
- int *buf_offset,
- int *count,
- blk_opf_t op)
-{
- int page_index;
- unsigned int total_nr_pages = bp->b_page_count;
- int nr_pages;
- struct bio *bio;
- sector_t sector = bp->b_maps[map].bm_bn;
- int size;
- int offset;
-
- /* skip the pages in the buffer before the start offset */
- page_index = 0;
- offset = *buf_offset;
- while (offset >= PAGE_SIZE) {
- page_index++;
- offset -= PAGE_SIZE;
- }
-
- /*
- * Limit the IO size to the length of the current vector, and update the
- * remaining IO count for the next time around.
- */
- size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
- *count -= size;
- *buf_offset += size;
-
-next_chunk:
- atomic_inc(&bp->b_io_remaining);
- nr_pages = bio_max_segs(total_nr_pages);
-
- bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
- bio->bi_iter.bi_sector = sector;
- bio->bi_end_io = xfs_buf_bio_end_io;
- bio->bi_private = bp;
-
- for (; size && nr_pages; nr_pages--, page_index++) {
- int rbytes, nbytes = PAGE_SIZE - offset;
-
- if (nbytes > size)
- nbytes = size;
-
- rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
- offset);
- if (rbytes < nbytes)
- break;
+ struct xfs_buf *bp = bio->bi_private;
- offset = 0;
- sector += BTOBB(nbytes);
- size -= nbytes;
- total_nr_pages--;
- }
+ if (bio->bi_status)
+ xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
+ else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+ XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ xfs_buf_ioerror(bp, -EIO);
- if (likely(bio->bi_iter.bi_size)) {
- if (xfs_buf_is_vmapped(bp)) {
- flush_kernel_vmap_range(bp->b_addr,
- xfs_buf_vmap_len(bp));
- }
- submit_bio(bio);
- if (size)
- goto next_chunk;
+ if (bp->b_flags & XBF_ASYNC) {
+ INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+ queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
} else {
- /*
- * This is guaranteed not to be the last io reference count
- * because the caller (xfs_buf_submit) holds a count itself.
- */
- atomic_dec(&bp->b_io_remaining);
- xfs_buf_ioerror(bp, -EIO);
- bio_put(bio);
+ complete(&bp->b_iowait);
}
+ bio_put(bio);
}
-STATIC void
-_xfs_buf_ioapply(
- struct xfs_buf *bp)
+static inline blk_opf_t
+xfs_buf_bio_op(
+ struct xfs_buf *bp)
{
- struct blk_plug plug;
- blk_opf_t op;
- int offset;
- int size;
- int i;
-
- /*
- * Make sure we capture only current IO errors rather than stale errors
- * left over from previous use of the buffer (e.g. failed readahead).
- */
- bp->b_error = 0;
+ blk_opf_t op;
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
-
- /*
- * Run the write verifier callback function if it exists. If
- * this function fails it will mark the buffer with an error and
- * the IO should not be dispatched.
- */
- if (bp->b_ops) {
- bp->b_ops->verify_write(bp);
- if (bp->b_error) {
- xfs_force_shutdown(bp->b_mount,
- SHUTDOWN_CORRUPT_INCORE);
- return;
- }
- } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
- struct xfs_mount *mp = bp->b_mount;
-
- /*
- * non-crc filesystems don't attach verifiers during
- * log recovery, so don't warn for such filesystems.
- */
- if (xfs_has_crc(mp)) {
- xfs_warn(mp,
- "%s: no buf ops on daddr 0x%llx len %d",
- __func__, xfs_buf_daddr(bp),
- bp->b_length);
- xfs_hex_dump(bp->b_addr,
- XFS_CORRUPTION_DUMP_LEN);
- dump_stack();
- }
- }
} else {
op = REQ_OP_READ;
if (bp->b_flags & XBF_READ_AHEAD)
op |= REQ_RAHEAD;
}
- /* we only use the buffer cache for meta-data */
- op |= REQ_META;
+ return op | REQ_META;
+}
- /* in-memory targets are directly mapped, no IO required. */
- if (xfs_buftarg_is_mem(bp->b_target)) {
- xfs_buf_ioend(bp);
- return;
+static void
+xfs_buf_submit_bio(
+ struct xfs_buf *bp)
+{
+ unsigned int size = BBTOB(bp->b_length);
+ unsigned int map = 0, p;
+ struct blk_plug plug;
+ struct bio *bio;
+
+ bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
+ xfs_buf_bio_op(bp), GFP_NOIO);
+ bio->bi_private = bp;
+ bio->bi_end_io = xfs_buf_bio_end_io;
+
+ if (bp->b_flags & _XBF_KMEM) {
+ __bio_add_page(bio, virt_to_page(bp->b_addr), size,
+ bp->b_offset);
+ } else {
+ for (p = 0; p < bp->b_page_count; p++)
+ __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
+ bio->bi_iter.bi_size = size; /* limit to the actual size used */
+
+ if (xfs_buf_is_vmapped(bp))
+ flush_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
}
/*
- * Walk all the vectors issuing IO on them. Set up the initial offset
- * into the buffer and the desired IO size before we start -
- * _xfs_buf_ioapply_vec() will modify them appropriately for each
- * subsequent call.
+ * If there is more than one map segment, split out a new bio for each
+ * map except of the last one. The last map is handled by the
+ * remainder of the original bio outside the loop.
*/
- offset = bp->b_offset;
- size = BBTOB(bp->b_length);
blk_start_plug(&plug);
- for (i = 0; i < bp->b_map_count; i++) {
- xfs_buf_ioapply_map(bp, i, &offset, &size, op);
- if (bp->b_error)
- break;
- if (size <= 0)
- break; /* all done */
+ for (map = 0; map < bp->b_map_count - 1; map++) {
+ struct bio *split;
+
+ split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
+ &fs_bio_set);
+ split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
+ bio_chain(split, bio);
+ submit_bio(split);
}
+ bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
+ submit_bio(bio);
blk_finish_plug(&plug);
}
@@ -1689,26 +1514,55 @@ xfs_buf_iowait(
{
ASSERT(!(bp->b_flags & XBF_ASYNC));
- trace_xfs_buf_iowait(bp, _RET_IP_);
- wait_for_completion(&bp->b_iowait);
- trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ do {
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+ wait_for_completion(&bp->b_iowait);
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ } while (!__xfs_buf_ioend(bp));
return bp->b_error;
}
/*
+ * Run the write verifier callback function if it exists. If this fails, mark
+ * the buffer with an error and do not dispatch the I/O.
+ */
+static bool
+xfs_buf_verify_write(
+ struct xfs_buf *bp)
+{
+ if (bp->b_ops) {
+ bp->b_ops->verify_write(bp);
+ if (bp->b_error)
+ return false;
+ } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
+ /*
+ * Non-crc filesystems don't attach verifiers during log
+ * recovery, so don't warn for such filesystems.
+ */
+ if (xfs_has_crc(bp->b_mount)) {
+ xfs_warn(bp->b_mount,
+ "%s: no buf ops on daddr 0x%llx len %d",
+ __func__, xfs_buf_daddr(bp),
+ bp->b_length);
+ xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN);
+ dump_stack();
+ }
+ }
+
+ return true;
+}
+
+/*
* Buffer I/O submission path, read or write. Asynchronous submission transfers
* the buffer lock ownership and the current reference to the IO. It is not
* safe to reference the buffer after a call to this function unless the caller
* holds an additional reference itself.
*/
-static int
-__xfs_buf_submit(
- struct xfs_buf *bp,
- bool wait)
+static void
+xfs_buf_submit(
+ struct xfs_buf *bp)
{
- int error = 0;
-
trace_xfs_buf_submit(bp, _RET_IP_);
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
@@ -1728,57 +1582,33 @@ __xfs_buf_submit(
* state here rather than mount state to avoid corrupting the log tail
* on shutdown.
*/
- if (bp->b_mount->m_log &&
- xlog_is_shutdown(bp->b_mount->m_log)) {
+ if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) {
xfs_buf_ioend_fail(bp);
- return -EIO;
+ return;
}
- /*
- * Grab a reference so the buffer does not go away underneath us. For
- * async buffers, I/O completion drops the callers reference, which
- * could occur before submission returns.
- */
- xfs_buf_hold(bp);
-
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
- /* clear the internal error state to avoid spurious errors */
- bp->b_io_error = 0;
-
/*
- * Set the count to 1 initially, this will stop an I/O completion
- * callout which happens before we have started all the I/O from calling
- * xfs_buf_ioend too early.
+ * Make sure we capture only current IO errors rather than stale errors
+ * left over from previous use of the buffer (e.g. failed readahead).
*/
- atomic_set(&bp->b_io_remaining, 1);
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_ioacct_inc(bp);
- _xfs_buf_ioapply(bp);
+ bp->b_error = 0;
- /*
- * If _xfs_buf_ioapply failed, we can get back here with only the IO
- * reference we took above. If we drop it to zero, run completion so
- * that we don't return to the caller with completion still pending.
- */
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
- if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
- xfs_buf_ioend(bp);
- else
- xfs_buf_ioend_async(bp);
+ if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) {
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_ioend(bp);
+ return;
}
- if (wait)
- error = xfs_buf_iowait(bp);
+ /* In-memory targets are directly mapped, no I/O required. */
+ if (xfs_buftarg_is_mem(bp->b_target)) {
+ xfs_buf_ioend(bp);
+ return;
+ }
- /*
- * Release the hold that keeps the buffer referenced for the entire
- * I/O. Note that if the buffer is async, it is not safe to reference
- * after this release.
- */
- xfs_buf_rele(bp);
- return error;
+ xfs_buf_submit_bio(bp);
}
void *
@@ -1863,13 +1693,14 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
- if (atomic_read(&bp->b_hold) > 1) {
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ if (bp->b_hold > 1) {
/* need to wait, so skip it this pass */
+ spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- if (!spin_trylock(&bp->b_lock))
- return LRU_SKIP;
/*
* clear the LRU reference count so the buffer doesn't get
@@ -1890,9 +1721,8 @@ xfs_buftarg_wait(
struct xfs_buftarg *btp)
{
/*
- * First wait on the buftarg I/O count for all in-flight buffers to be
- * released. This is critical as new buffers do not make the LRU until
- * they are released.
+ * First wait for all in-flight readahead buffers to be released. This is
+ * critical as new buffers do not make the LRU until they are released.
*
* Next, flush the buffer workqueue to ensure all completion processing
* has finished. Just waiting on buffer locks is not sufficient for
@@ -1901,7 +1731,7 @@ xfs_buftarg_wait(
* all reference counts have been dropped before we start walking the
* LRU list.
*/
- while (percpu_counter_sum(&btp->bt_io_count))
+ while (percpu_counter_sum(&btp->bt_readahead_count))
delay(100);
flush_workqueue(btp->bt_mount->m_buf_workqueue);
}
@@ -2018,8 +1848,8 @@ xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
- ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
- percpu_counter_destroy(&btp->bt_io_count);
+ ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
+ percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
}
@@ -2073,7 +1903,7 @@ xfs_init_buftarg(
if (list_lru_init(&btp->bt_lru))
return -ENOMEM;
- if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
btp->bt_shrinker =
@@ -2087,7 +1917,7 @@ xfs_init_buftarg(
return 0;
out_destroy_io_count:
- percpu_counter_destroy(&btp->bt_io_count);
+ percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
return -ENOMEM;
@@ -2208,7 +2038,7 @@ xfs_buf_delwri_queue(
*/
bp->b_flags |= _XBF_DELWRI_Q;
if (list_empty(&bp->b_list)) {
- atomic_inc(&bp->b_hold);
+ xfs_buf_hold(bp);
list_add_tail(&bp->b_list, list);
}
@@ -2266,72 +2096,26 @@ xfs_buf_cmp(
return 0;
}
-/*
- * Submit buffers for write. If wait_list is specified, the buffers are
- * submitted using sync I/O and placed on the wait list such that the caller can
- * iowait each buffer. Otherwise async I/O is used and the buffers are released
- * at I/O completion time. In either case, buffers remain locked until I/O
- * completes and the buffer is released from the queue.
- */
-static int
-xfs_buf_delwri_submit_buffers(
- struct list_head *buffer_list,
- struct list_head *wait_list)
+static bool
+xfs_buf_delwri_submit_prep(
+ struct xfs_buf *bp)
{
- struct xfs_buf *bp, *n;
- int pinned = 0;
- struct blk_plug plug;
-
- list_sort(NULL, buffer_list, xfs_buf_cmp);
-
- blk_start_plug(&plug);
- list_for_each_entry_safe(bp, n, buffer_list, b_list) {
- if (!wait_list) {
- if (!xfs_buf_trylock(bp))
- continue;
- if (xfs_buf_ispinned(bp)) {
- xfs_buf_unlock(bp);
- pinned++;
- continue;
- }
- } else {
- xfs_buf_lock(bp);
- }
-
- /*
- * Someone else might have written the buffer synchronously or
- * marked it stale in the meantime. In that case only the
- * _XBF_DELWRI_Q flag got cleared, and we have to drop the
- * reference and remove it from the list here.
- */
- if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- xfs_buf_list_del(bp);
- xfs_buf_relse(bp);
- continue;
- }
-
- trace_xfs_buf_delwri_split(bp, _RET_IP_);
-
- /*
- * If we have a wait list, each buffer (and associated delwri
- * queue reference) transfers to it and is submitted
- * synchronously. Otherwise, drop the buffer from the delwri
- * queue and submit async.
- */
- bp->b_flags &= ~_XBF_DELWRI_Q;
- bp->b_flags |= XBF_WRITE;
- if (wait_list) {
- bp->b_flags &= ~XBF_ASYNC;
- list_move_tail(&bp->b_list, wait_list);
- } else {
- bp->b_flags |= XBF_ASYNC;
- xfs_buf_list_del(bp);
- }
- __xfs_buf_submit(bp, false);
+ /*
+ * Someone else might have written the buffer synchronously or marked it
+ * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got
+ * cleared, and we have to drop the reference and remove it from the
+ * list here.
+ */
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ xfs_buf_list_del(bp);
+ xfs_buf_relse(bp);
+ return false;
}
- blk_finish_plug(&plug);
- return pinned;
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+ bp->b_flags |= XBF_WRITE;
+ return true;
}
/*
@@ -2354,7 +2138,30 @@ int
xfs_buf_delwri_submit_nowait(
struct list_head *buffer_list)
{
- return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
+ struct xfs_buf *bp, *n;
+ int pinned = 0;
+ struct blk_plug plug;
+
+ list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
+ if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
+ pinned++;
+ continue;
+ }
+ if (!xfs_buf_delwri_submit_prep(bp))
+ continue;
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_list_del(bp);
+ xfs_buf_submit(bp);
+ }
+ blk_finish_plug(&plug);
+
+ return pinned;
}
/*
@@ -2371,9 +2178,21 @@ xfs_buf_delwri_submit(
{
LIST_HEAD (wait_list);
int error = 0, error2;
- struct xfs_buf *bp;
+ struct xfs_buf *bp, *n;
+ struct blk_plug plug;
+
+ list_sort(NULL, buffer_list, xfs_buf_cmp);
- xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ xfs_buf_lock(bp);
+ if (!xfs_buf_delwri_submit_prep(bp))
+ continue;
+ bp->b_flags &= ~XBF_ASYNC;
+ list_move_tail(&bp->b_list, &wait_list);
+ xfs_buf_submit(bp);
+ }
+ blk_finish_plug(&plug);
/* Wait for IO to complete. */
while (!list_empty(&wait_list)) {
@@ -2398,14 +2217,9 @@ xfs_buf_delwri_submit(
* Push a single buffer on a delwri queue.
*
* The purpose of this function is to submit a single buffer of a delwri queue
- * and return with the buffer still on the original queue. The waiting delwri
- * buffer submission infrastructure guarantees transfer of the delwri queue
- * buffer reference to a temporary wait list. We reuse this infrastructure to
- * transfer the buffer back to the original queue.
+ * and return with the buffer still on the original queue.
*
- * Note the buffer transitions from the queued state, to the submitted and wait
- * listed state and back to the queued state during this call. The buffer
- * locking and queue management logic between _delwri_pushbuf() and
+ * The buffer locking and queue management logic between _delwri_pushbuf() and
* _delwri_queue() guarantee that the buffer cannot be queued to another list
* before returning.
*/
@@ -2414,33 +2228,21 @@ xfs_buf_delwri_pushbuf(
struct xfs_buf *bp,
struct list_head *buffer_list)
{
- LIST_HEAD (submit_list);
int error;
ASSERT(bp->b_flags & _XBF_DELWRI_Q);
trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
- /*
- * Isolate the buffer to a new local list so we can submit it for I/O
- * independently from the rest of the original list.
- */
xfs_buf_lock(bp);
- list_move(&bp->b_list, &submit_list);
- xfs_buf_unlock(bp);
-
- /*
- * Delwri submission clears the DELWRI_Q buffer flag and returns with
- * the buffer on the wait list with the original reference. Rather than
- * bounce the buffer from a local wait list back to the original list
- * after I/O completion, reuse the original list as the wait list.
- */
- xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+ bp->b_flags |= XBF_WRITE;
+ xfs_buf_submit(bp);
/*
- * The buffer is now locked, under I/O and wait listed on the original
- * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
- * return with the buffer unlocked and on the original queue.
+ * The buffer is now locked, under I/O but still on the original delwri
+ * queue. Wait for I/O completion, restore the DELWRI_Q flag and
+ * return with the buffer unlocked and still on the original queue.
*/
error = xfs_buf_iowait(bp);
bp->b_flags |= _XBF_DELWRI_Q;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3d56bc7a35cc..80e06eecaf56 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -27,15 +27,12 @@ struct xfs_buf;
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
/* buffer type flags for write callbacks */
-#define _XBF_INODES (1u << 16)/* inode buffer */
-#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
@@ -60,13 +57,10 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
- { XBF_NO_IOACCT, "NO_IOACCT" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
- { _XBF_INODES, "INODES" }, \
- { _XBF_DQUOTS, "DQUOTS" }, \
{ _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
@@ -81,10 +75,8 @@ typedef unsigned int xfs_buf_flags_t;
* Internal state flags.
*/
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
struct xfs_buf_cache {
- spinlock_t bc_lock;
struct rhashtable bc_hash;
};
@@ -121,7 +113,7 @@ struct xfs_buftarg {
struct shrinker *bt_shrinker;
struct list_lru bt_lru;
- struct percpu_counter bt_io_count;
+ struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
/* Atomic write unit values */
@@ -172,7 +164,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- atomic_t b_hold; /* reference count */
+ unsigned int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -184,10 +176,9 @@ struct xfs_buf {
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
unsigned int b_state; /* internal state flags */
- int b_io_error; /* internal IO error state */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
- struct xfs_perag *b_pag; /* contains rbtree root */
+ struct xfs_perag *b_pag;
struct xfs_mount *b_mount;
struct xfs_buftarg *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
@@ -202,11 +193,11 @@ struct xfs_buf {
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
atomic_t b_pin_count; /* pin count */
- atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset of b_addr,
only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
+ void (*b_iodone)(struct xfs_buf *bp);
/*
* async write failure retry count. Initialised to zero on the first
@@ -297,7 +288,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
-int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
+int _xfs_buf_read(struct xfs_buf *bp);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 4d8a6aece995..8cde85259a58 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -54,17 +54,12 @@ bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
-void xfs_buf_inode_io_fail(struct xfs_buf *bp);
#ifdef CONFIG_XFS_QUOTA
void xfs_buf_dquot_iodone(struct xfs_buf *);
-void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
#else
static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
{
}
-static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
-{
-}
#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 3d0c6402cb36..05a2f6927c12 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -262,12 +262,18 @@ xlog_recover_validate_buf_type(
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
break;
+ case XFS_RTRMAP_CRC_MAGIC:
+ bp->b_ops = &xfs_rtrmapbt_buf_ops;
+ break;
case XFS_RMAP_CRC_MAGIC:
bp->b_ops = &xfs_rmapbt_buf_ops;
break;
case XFS_REFC_CRC_MAGIC:
bp->b_ops = &xfs_refcountbt_buf_ops;
break;
+ case XFS_RTREFC_CRC_MAGIC:
+ bp->b_ops = &xfs_rtrefcountbt_buf_ops;
+ break;
default:
warnmsg = "Bad btree block magic!";
break;
@@ -855,6 +861,8 @@ xlog_recover_get_buf_lsn(
uuid = &btb->bb_u.s.bb_uuid;
break;
}
+ case XFS_RTRMAP_CRC_MAGIC:
+ case XFS_RTREFC_CRC_MAGIC:
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC: {
struct xfs_btree_block *btb = blk;
@@ -1079,7 +1087,7 @@ xlog_recover_buf_commit_pass2(
error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
current_lsn);
if (error)
- goto out_release;
+ goto out_writebuf;
/* Update the rt superblock if we have one. */
if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) {
@@ -1097,6 +1105,15 @@ xlog_recover_buf_commit_pass2(
}
/*
+ * Buffer held by buf log item during 'normal' buffer recovery must
+ * be committed through buffer I/O submission path to ensure proper
+ * release. When error occurs during sb buffer recovery, log shutdown
+ * will be done before submitting buffer list so that buffers can be
+ * released correctly through ioend failure path.
+ */
+out_writebuf:
+
+ /*
* Perform delayed write on the buffer. Asynchronous writes will be
* slower when taking into account all the buffers to be flushed.
*
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 07bebbfb16ee..5b64a2b3b113 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -117,7 +117,7 @@ xmbuf_free(
struct xfs_buftarg *btp)
{
ASSERT(xfs_buftarg_is_mem(btp));
- ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+ ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
trace_xmbuf_free(btp);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index c4bd145f5ec1..3f2403a7b49c 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -90,7 +90,7 @@ xfs_discard_endio_work(
/*
* Queue up the actual completion to a thread to avoid IRQ-safe locking for
- * pagb_lock.
+ * eb_lock.
*/
static void
xfs_discard_endio(
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 201c26322ede..edbc521870a1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1230,18 +1230,6 @@ xfs_buf_dquot_iodone(
}
}
-void
-xfs_buf_dquot_io_fail(
- struct xfs_buf *bp)
-{
- struct xfs_log_item *lip;
-
- spin_lock(&bp->b_mount->m_ail->ail_lock);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
- set_bit(XFS_LI_FAILED, &lip->li_flags);
- spin_unlock(&bp->b_mount->m_ail->ail_lock);
-}
-
/* Check incore dquot for errors before we flush. */
static xfs_failaddr_t
xfs_qm_dqflush_check(
@@ -1316,7 +1304,8 @@ out_abort:
/*
* Attach a dquot buffer to this dquot to avoid allocating a buffer during a
- * dqflush, since dqflush can be called from reclaim context.
+ * dqflush, since dqflush can be called from reclaim context. Caller must hold
+ * the dqlock.
*/
int
xfs_dquot_attach_buf(
@@ -1337,13 +1326,16 @@ xfs_dquot_attach_buf(
return error;
/*
- * Attach the dquot to the buffer so that the AIL does not have
- * to read the dquot buffer to push this item.
+ * Hold the dquot buffer so that we retain our ref to it after
+ * detaching it from the transaction, then give that ref to the
+ * dquot log item so that the AIL does not have to read the
+ * dquot buffer to push this item.
*/
xfs_buf_hold(bp);
+ xfs_trans_brelse(tp, bp);
+
spin_lock(&qlip->qli_lock);
lip->li_buf = bp;
- xfs_trans_brelse(tp, bp);
}
qlip->qli_dirty = true;
spin_unlock(&qlip->qli_lock);
@@ -1459,7 +1451,7 @@ xfs_qm_dqflush(
* Attach the dquot to the buffer so that we can remove this dquot from
* the AIL and release the flush lock once the dquot is synced to disk.
*/
- bp->b_flags |= _XBF_DQUOTS;
+ bp->b_iodone = xfs_buf_dquot_iodone;
list_add_tail(&lip->li_bio_list, &bp->b_li_list);
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c617bac75361..61217adf5ba5 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -160,6 +160,9 @@ static inline struct xfs_dquot *xfs_inode_dquot(
struct xfs_inode *ip,
xfs_dqtype_t type)
{
+ if (xfs_is_metadir_inode(ip))
+ return NULL;
+
switch (type) {
case XFS_DQTYPE_USER:
return ip->i_udquot;
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
index 5ede81fadbd8..fa5f31931efd 100644
--- a/fs/xfs/xfs_drain.c
+++ b/fs/xfs/xfs_drain.c
@@ -13,28 +13,28 @@
#include "xfs_trace.h"
/*
- * Use a static key here to reduce the overhead of xfs_drain_rele. If the
- * compiler supports jump labels, the static branch will be replaced by a nop
- * sled when there are no xfs_drain_wait callers. Online fsck is currently
- * the only caller, so this is a reasonable tradeoff.
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele. If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no xfs_defer_drain_wait callers. Online fsck is
+ * currently the only caller, so this is a reasonable tradeoff.
*
* Note: Patching the kernel code requires taking the cpu hotplug lock. Other
* parts of the kernel allocate memory with that lock held, which means that
* XFS callers cannot hold any locks that might be used by memory reclaim or
* writeback when calling the static_branch_{inc,dec} functions.
*/
-static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate);
void
-xfs_drain_wait_disable(void)
+xfs_defer_drain_wait_disable(void)
{
- static_branch_dec(&xfs_drain_waiter_gate);
+ static_branch_dec(&xfs_defer_drain_waiter_gate);
}
void
-xfs_drain_wait_enable(void)
+xfs_defer_drain_wait_enable(void)
{
- static_branch_inc(&xfs_drain_waiter_gate);
+ static_branch_inc(&xfs_defer_drain_waiter_gate);
}
void
@@ -71,7 +71,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head)
static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
{
if (atomic_dec_and_test(&dr->dr_count) &&
- static_branch_unlikely(&xfs_drain_waiter_gate) &&
+ static_branch_unlikely(&xfs_defer_drain_waiter_gate) &&
has_waiters(&dr->dr_waiters))
wake_up(&dr->dr_waiters);
}
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
index efcf88df9a5e..4d446dbf65e5 100644
--- a/fs/xfs/xfs_drain.h
+++ b/fs/xfs/xfs_drain.h
@@ -26,8 +26,8 @@ struct xfs_defer_drain {
void xfs_defer_drain_init(struct xfs_defer_drain *dr);
void xfs_defer_drain_free(struct xfs_defer_drain *dr);
-void xfs_drain_wait_disable(void);
-void xfs_drain_wait_enable(void);
+void xfs_defer_drain_wait_disable(void);
+void xfs_defer_drain_wait_enable(void);
/*
* Deferred Work Intent Drains
@@ -61,6 +61,9 @@ void xfs_drain_wait_enable(void);
* All functions that create work items must increment the intent counter as
* soon as the item is added to the transaction and cannot drop the counter
* until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
*/
struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp,
xfs_fsblock_t fsbno, enum xfs_group_type type);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 78cdc5064a8c..dbd87e137694 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -63,6 +63,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_WB_DELAY_MS,
XFS_RANDOM_WRITE_DELAY_MS,
XFS_RANDOM_EXCHMAPS_FINISH_ONE,
+ XFS_RANDOM_METAFILE_RESV_CRITICAL,
};
struct xfs_errortag_attr {
@@ -181,6 +182,7 @@ XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -227,6 +229,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index 265c42449893..0b41bdfecdfb 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -119,6 +119,9 @@ xfs_exchrange_reserve_quota(
int ip1_error = 0;
int error;
+ ASSERT(!xfs_is_metadir_inode(req->ip1));
+ ASSERT(!xfs_is_metadir_inode(req->ip2));
+
/*
* Don't bother with a quota reservation if we're not enforcing them
* or the two inodes have the same dquots.
@@ -326,22 +329,6 @@ out_trans_cancel:
* successfully but before locks are dropped.
*/
-/* Verify that we have security clearance to perform this operation. */
-static int
-xfs_exchange_range_verify_area(
- struct xfs_exchrange *fxr)
-{
- int ret;
-
- ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
- true);
- if (ret)
- return ret;
-
- return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
- true);
-}
-
/*
* Performs necessary checks before doing a range exchange, having stabilized
* mutable inode attributes via i_rwsem.
@@ -352,11 +339,13 @@ xfs_exchange_range_checks(
unsigned int alloc_unit)
{
struct inode *inode1 = file_inode(fxr->file1);
+ loff_t size1 = i_size_read(inode1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t size2 = i_size_read(inode2);
uint64_t allocmask = alloc_unit - 1;
int64_t test_len;
uint64_t blen;
- loff_t size1, size2, tmp;
+ loff_t tmp;
int error;
/* Don't touch certain kinds of inodes */
@@ -365,24 +354,25 @@ xfs_exchange_range_checks(
if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
return -ETXTBSY;
- size1 = i_size_read(inode1);
- size2 = i_size_read(inode2);
-
/* Ranges cannot start after EOF. */
if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
return -EINVAL;
- /*
- * If the caller said to exchange to EOF, we set the length of the
- * request large enough to cover everything to the end of both files.
- */
if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ /*
+ * If the caller said to exchange to EOF, we set the length of
+ * the request large enough to cover everything to the end of
+ * both files.
+ */
fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
size2 - fxr->file2_offset);
-
- error = xfs_exchange_range_verify_area(fxr);
- if (error)
- return error;
+ } else {
+ /*
+ * Otherwise we require both ranges to end within EOF.
+ */
+ if (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2)
+ return -EINVAL;
}
/*
@@ -399,15 +389,6 @@ xfs_exchange_range_checks(
return -EINVAL;
/*
- * We require both ranges to end within EOF, unless we're exchanging
- * to EOF.
- */
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
- (fxr->file1_offset + fxr->length > size1 ||
- fxr->file2_offset + fxr->length > size2))
- return -EINVAL;
-
- /*
* Make sure we don't hit any file size limits. If we hit any size
* limits such that test_length was adjusted, we abort the whole
* operation.
@@ -744,6 +725,7 @@ xfs_exchange_range(
{
struct inode *inode1 = file_inode(fxr->file1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t check_len = fxr->length;
int ret;
BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
@@ -776,14 +758,18 @@ xfs_exchange_range(
return -EBADF;
/*
- * If we're not exchanging to EOF, we can check the areas before
- * stabilizing both files' i_size.
+ * If we're exchanging to EOF we can't calculate the length until taking
+ * the iolock. Pass a 0 length to remap_verify_area similar to the
+ * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
*/
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
- ret = xfs_exchange_range_verify_area(fxr);
- if (ret)
- return ret;
- }
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ check_len = 0;
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
+ if (ret)
+ return ret;
+ ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
+ if (ret)
+ return ret;
/* Update cmtime if the fd/inode don't forbid it. */
if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..85b857805d6d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1498,7 +1498,8 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
@@ -1613,7 +1614,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3290dd8524a6..1dbd2d75f7ae 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -26,6 +26,8 @@
#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/* Convert an xfs_fsmap to an fsmap. */
static void
@@ -211,21 +213,20 @@ xfs_getfsmap_is_shared(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_btree_cur *cur;
xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_extlen_t flen = 0;
int error;
*stat = false;
- if (!xfs_has_reflink(mp))
- return 0;
- /* rt files will have no perag structure */
- if (!info->group)
+ if (!xfs_has_reflink(mp) || !info->group)
return 0;
- /* Are there any shared blocks here? */
- flen = 0;
- cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
- to_perag(info->group));
+ if (info->group->xg_type == XG_TYPE_RTG)
+ cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group));
+ else
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+ to_perag(info->group));
+ /* Are there any shared blocks here? */
error = xfs_refcount_find_shared(cur, frec->rec_key,
XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen,
false);
@@ -832,6 +833,175 @@ xfs_getfsmap_rtdev_rtbitmap(
return error;
}
+
+/* Transform a realtime rmapbt record into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_fsmap_irec frec = {
+ .owner = rec->rm_owner,
+ .offset = rec->rm_offset,
+ .rm_flags = rec->rm_flags,
+ .rec_key = rec->rm_startblock,
+ };
+ struct xfs_getfsmap_info *info = priv;
+
+ return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group,
+ rec->rm_startblock, rec->rm_blockcount, &frec);
+}
+
+/* Actually query the rtrmap btree. */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp)
+{
+ struct xfs_rtgroup *rtg = to_rtg(info->group);
+
+ /* Query the rtrmapbt */
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+ *curpp = xfs_rtrmapbt_init_cursor(tp, rtg);
+ return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+ xfs_getfsmap_rtdev_rmapbt_helper, info);
+}
+
+/* Execute a getfsmap query against the realtime device rmapbt. */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_btree_cur *bt_cur = NULL;
+ xfs_rtblock_t start_rtb;
+ xfs_rtblock_t end_rtb;
+ xfs_rgnumber_t start_rg, end_rg;
+ uint64_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ info->missing_owner = XFS_FMR_OWN_FREE;
+
+ /*
+ * Convert the fsmap low/high keys to rtgroup based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the rtgroup.
+ */
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (info->low.rm_blockcount == 0) {
+ /* No previous record from which to continue */
+ } else if (rmap_not_shareable(mp, &info->low)) {
+ /* Last record seen was an unshareable extent */
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+
+ start_rtb += info->low.rm_blockcount;
+ if (xfs_rtb_to_daddr(mp, start_rtb) >= eofs)
+ return 0;
+ } else {
+ /* Last record seen was a shareable file data extent */
+ info->low.rm_offset += info->low.rm_blockcount;
+ }
+ info->low.rm_startblock = xfs_rtb_to_rgbno(mp, start_rtb);
+
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+ start_rg = xfs_rtb_to_rgno(mp, start_rtb);
+ end_rg = xfs_rtb_to_rgno(mp, end_rtb);
+
+ while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rg, end_rg))) {
+ /*
+ * Set the rtgroup high key from the fsmap high key if this
+ * is the last rtgroup that we're querying.
+ */
+ info->group = rtg_group(rtg);
+ if (rtg_rgno(rtg) == end_rg) {
+ info->high.rm_startblock =
+ xfs_rtb_to_rgbno(mp, end_rtb);
+ info->high.rm_offset =
+ XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ break;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ }
+
+ if (bt_cur) {
+ xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group),
+ XFS_RTGLOCK_RMAP |
+ XFS_RTGLOCK_REFCOUNT);
+ xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+ bt_cur = NULL;
+ }
+
+ trace_xfs_fsmap_low_group_key(mp, info->dev, rtg_rgno(rtg),
+ &info->low);
+ trace_xfs_fsmap_high_group_key(mp, info->dev, rtg_rgno(rtg),
+ &info->high);
+
+ error = xfs_getfsmap_rtdev_rmapbt_query(tp, info, &bt_cur);
+ if (error)
+ break;
+
+ /*
+ * Set the rtgroup low key to the start of the rtgroup prior to
+ * moving on to the next rtgroup.
+ */
+ if (rtg_rgno(rtg) == start_rg)
+ memset(&info->low, 0, sizeof(info->low));
+
+ /*
+ * If this is the last rtgroup, report any gap at the end of it
+ * before we drop the reference to the perag when the loop
+ * terminates.
+ */
+ if (rtg_rgno(rtg) == end_rg) {
+ info->last = true;
+ error = xfs_getfsmap_rtdev_rmapbt_helper(bt_cur,
+ &info->high, info);
+ if (error)
+ break;
+ }
+ info->group = NULL;
+ }
+
+ if (bt_cur) {
+ xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group),
+ XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+ xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ }
+
+ /* loop termination case */
+ if (rtg) {
+ info->group = NULL;
+ xfs_rtgroup_rele(rtg);
+ }
+
+ return error;
+}
#endif /* CONFIG_XFS_RT */
/* Do we recognize the device? */
@@ -971,7 +1141,10 @@ xfs_getfsmap(
if (mp->m_rtdev_targp) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
- handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+ if (use_rmap)
+ handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
+ else
+ handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
}
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 28dde215c899..455298503d01 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -21,6 +21,9 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trace.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -113,6 +116,12 @@ xfs_growfs_data_private(
xfs_buf_relse(bp);
}
+ /* Make sure the new fs size won't cause problems with the log. */
+ error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks,
+ mp->m_sb.sb_rextsize);
+ if (error)
+ return error;
+
nb_div = nb;
nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS)
@@ -220,7 +229,12 @@ xfs_growfs_data_private(
error = xfs_fs_reserve_ag_blocks(mp);
if (error == -ENOSPC)
error = 0;
+
+ /* Compute new maxlevels for rt btrees. */
+ xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
}
+
return error;
out_trans_cancel:
@@ -541,6 +555,19 @@ xfs_fs_reserve_ag_blocks(
xfs_warn(mp,
"Error %d reserving per-AG metadata reserve pool.", error);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (xfs_has_realtime(mp)) {
+ err2 = xfs_rt_resv_init(mp);
+ if (err2 && err2 != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving realtime metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ if (err2 && !error)
+ error = err2;
}
return error;
@@ -555,6 +582,9 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;
+ if (xfs_has_realtime(mp))
+ xfs_rt_resv_free(mp);
+
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index c7c2e6561998..7c541fb373d5 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -447,6 +447,8 @@ static const struct ioctl_sick_map rtgroup_map[] = {
{ XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER },
{ XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP },
{ XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY },
+ { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT },
+ { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT },
};
/* Fill out rtgroup geometry health info. */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8ad2606f928..b1f9f156ec88 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1404,8 +1404,11 @@ xfs_inactive(
goto out;
/* Try to clean out the cow blocks if there are any. */
- if (xfs_inode_has_cow_data(ip))
- xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (xfs_inode_has_cow_data(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (error)
+ goto out;
+ }
if (VFS_I(ip)->i_nlink != 0) {
/*
@@ -2382,7 +2385,16 @@ xfs_iflush(
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto flush_out;
}
- if (S_ISREG(VFS_I(ip)->i_mode)) {
+ if (ip->i_df.if_format == XFS_DINODE_FMT_META_BTREE) {
+ if (!S_ISREG(VFS_I(ip)->i_mode) ||
+ !(ip->i_diflags2 & XFS_DIFLAG2_METADATA)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad %s meta btree inode %Lu, ptr "PTR_FMT,
+ __func__, xfs_metafile_type_str(ip->i_metatype),
+ ip->i_ino, ip);
+ goto flush_out;
+ }
+ } else if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
@@ -2422,6 +2434,14 @@ xfs_iflush(
goto flush_out;
}
+ if (xfs_inode_has_attr_fork(ip) &&
+ ip->i_af.if_format == XFS_DINODE_FMT_META_BTREE) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: meta btree in inode %Lu attr fork, ptr "PTR_FMT,
+ __func__, ip->i_ino, ip);
+ goto flush_out;
+ }
+
/*
* Inode item log recovery for v2 inodes are dependent on the flushiter
* count for correct sequencing. We bump the flush iteration count so
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1648dc5a8068..c08093a65352 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -25,9 +25,19 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
- struct xfs_dquot *i_udquot; /* user dquot */
- struct xfs_dquot *i_gdquot; /* group dquot */
- struct xfs_dquot *i_pdquot; /* project dquot */
+ union {
+ struct {
+ struct xfs_dquot *i_udquot; /* user dquot */
+ struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
+ };
+
+ /*
+ * Space that has been set aside to accomodate expansions of a
+ * metadata btree rooted in this file.
+ */
+ uint64_t i_meta_resv_asked;
+ };
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 912f0b1bc3cb..35803fcf0beb 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -157,6 +157,20 @@ xfs_inode_item_precommit(
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+ /*
+ * Inode verifiers do not check that the CoW extent size hint is an
+ * integer multiple of the rt extent size on a directory with both
+ * rtinherit and cowextsize flags set. If we're logging a directory
+ * that is misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -185,7 +199,7 @@ xfs_inode_item_precommit(
xfs_buf_hold(bp);
spin_lock(&iip->ili_lock);
iip->ili_item.li_buf = bp;
- bp->b_flags |= _XBF_INODES;
+ bp->b_iodone = xfs_buf_inode_iodone;
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
xfs_trans_brelse(tp, bp);
}
@@ -242,6 +256,7 @@ xfs_inode_item_data_fork_size(
}
break;
case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
*nbytes += ip->i_df.if_broot_bytes;
@@ -362,6 +377,7 @@ xfs_inode_item_format_data_fork(
}
break;
case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
@@ -1023,18 +1039,6 @@ xfs_buf_inode_iodone(
list_splice_tail(&flushed_inodes, &bp->b_li_list);
}
-void
-xfs_buf_inode_io_fail(
- struct xfs_buf *bp)
-{
- struct xfs_log_item *lip;
-
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- set_bit(XFS_LI_FAILED, &lip->li_flags);
- clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
- }
-}
-
/*
* Clear the inode logging fields so no more flushes are attempted. If we are
* on a buffer list, it is now safe to remove it because the buffer is
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index e70d2611456b..f3bfb814378c 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -22,6 +22,8 @@
#include "xfs_log_recover.h"
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
STATIC void
xlog_recover_inode_ra_pass2(
@@ -266,6 +268,41 @@ xlog_dinode_verify_extent_counts(
return 0;
}
+static inline int
+xlog_recover_inode_dbroot(
+ struct xfs_mount *mp,
+ void *src,
+ unsigned int len,
+ struct xfs_dinode *dip)
+{
+ void *dfork = XFS_DFORK_DPTR(dip);
+ unsigned int dsize = XFS_DFORK_DSIZE(dip, mp);
+
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_BTREE:
+ xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize);
+ break;
+ case XFS_DINODE_FMT_META_BTREE:
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize);
+ return 0;
+ case XFS_METAFILE_RTREFCOUNT:
+ xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize);
+ return 0;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
STATIC int
xlog_recover_inode_commit_pass2(
struct xlog *log,
@@ -393,8 +430,9 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
+ if (ldip->di_format != XFS_DINODE_FMT_EXTENTS &&
+ ldip->di_format != XFS_DINODE_FMT_BTREE &&
+ ldip->di_format != XFS_DINODE_FMT_META_BTREE) {
XFS_CORRUPTION_ERROR(
"Bad log dinode data fork format for regular file",
XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
@@ -475,9 +513,9 @@ xlog_recover_inode_commit_pass2(
break;
case XFS_ILOG_DBROOT:
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
- (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
- XFS_DFORK_DSIZE(dip, mp));
+ error = xlog_recover_inode_dbroot(mp, src, len, dip);
+ if (error)
+ goto out_release;
break;
default:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f95103325318..ed85322507dd 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -469,8 +469,21 @@ xfs_fill_fsxattr(
}
}
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ /*
+ * Don't let a misaligned CoW extent size hint on a directory
+ * escape to userspace if it won't pass the setattr checks
+ * later.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) {
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+ fa->fsx_cowextsize = 0;
+ } else {
+ fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ }
+ }
+
fa->fsx_projid = ip->i_projid;
if (ifp && !xfs_need_iread_extents(ifp))
fa->fsx_nextents = xfs_iext_count(ifp);
@@ -541,10 +554,6 @@ xfs_ioctl_setattr_xflags(
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
-
- /* Clear reflink if we are actually able to set the rt flag. */
- if (xfs_is_reflink_inode(ip))
- ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
}
/* diflags2 only valid for v3 inodes. */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 50fa3ef89f6c..f631177ac320 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -828,6 +828,10 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ /* HW-offload atomics are always used in this path */
+ if (flags & IOMAP_ATOMIC)
+ iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
@@ -976,10 +980,8 @@ xfs_dax_write_iomap_end(
if (!xfs_is_cow_inode(ip))
return 0;
- if (!written) {
- xfs_reflink_cancel_cow_range(ip, pos, length, true);
- return 0;
- }
+ if (!written)
+ return xfs_reflink_cancel_cow_range(ip, pos, length, true);
return xfs_reflink_end_cow(ip, pos, written);
}
@@ -1497,7 +1499,7 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
int
@@ -1512,5 +1514,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 40289fe6f5b2..a4480098d2bf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -298,14 +298,14 @@ xfs_vn_create(
return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
-STATIC int
+STATIC struct dentry *
xfs_vn_mkdir(
struct mnt_idmap *idmap,
struct inode *dir,
struct dentry *dentry,
umode_t mode)
{
- return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
+ return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL));
}
STATIC struct dentry *
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 05daad8a8d34..f8851ff835de 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2744,8 +2744,6 @@ xfs_log_ticket_regrant(
if (!ticket->t_cnt) {
xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res);
trace_xfs_log_ticket_regrant_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
}
xfs_log_ticket_put(ticket);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0af3d477197b..2f76531842f8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1820,6 +1820,10 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_xmd_item_ops,
&xlog_rtefi_item_ops,
&xlog_rtefd_item_ops,
+ &xlog_rtrui_item_ops,
+ &xlog_rtrud_item_ops,
+ &xlog_rtcui_item_ops,
+ &xlog_rtcud_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -3376,7 +3380,7 @@ xlog_do_recover(
*/
xfs_buf_lock(bp);
xfs_buf_hold(bp);
- error = _xfs_buf_read(bp, XBF_READ);
+ error = _xfs_buf_read(bp);
if (error) {
if (!xlog_is_shutdown(log)) {
xfs_buf_ioerror_alert(bp, __this_address);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5918f433dba7..b69356582b86 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -37,6 +37,8 @@
#include "xfs_rtbitmap.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -179,14 +181,11 @@ xfs_readsb(
/*
* Allocate a (locked) buffer to hold the superblock. This will be kept
- * around at all times to optimize access to the superblock. Therefore,
- * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
- * elevated.
+ * around at all times to optimize access to the superblock.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), XBF_NO_IOACCT, &bp,
- buf_ops);
+ BTOBB(sector_size), 0, &bp, buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
@@ -650,6 +649,15 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+/* Compute maximum possible height for realtime btree types for this fs. */
+static inline void
+xfs_rtbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels,
+ mp->m_rtrefc_maxlevels);
+}
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -718,9 +726,12 @@ xfs_mountfs(
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
xfs_mount_setup_inode_geom(mp);
xfs_rmapbt_compute_maxlevels(mp);
+ xfs_rtrmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
xfs_agbtree_compute_maxlevels(mp);
+ xfs_rtbtree_compute_maxlevels(mp);
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index db9dade7d22a..fbed172d6770 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -158,13 +158,20 @@ typedef struct xfs_mount {
uint m_bmap_dmnr[2]; /* min bmap btree records */
uint m_rmap_mxr[2]; /* max rmap btree records */
uint m_rmap_mnr[2]; /* min rmap btree records */
+ uint m_rtrmap_mxr[2]; /* max rtrmap btree records */
+ uint m_rtrmap_mnr[2]; /* min rtrmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
+ uint m_rtrefc_mxr[2]; /* max rtrefc btree records */
+ uint m_rtrefc_mnr[2]; /* min rtrefc btree records */
uint m_alloc_maxlevels; /* max alloc btree levels */
uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
+ uint m_rtrmap_maxlevels; /* max rtrmap btree level */
uint m_refc_maxlevels; /* max refcount btree level */
+ uint m_rtrefc_maxlevels; /* max rtrefc btree level */
unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
+ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
@@ -350,7 +357,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */
#define __XFS_HAS_FEAT(name, NAME) \
-static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \
{ \
return mp->m_features & XFS_FEAT_ ## NAME; \
}
@@ -386,18 +393,30 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
-static inline bool xfs_has_rtgroups(struct xfs_mount *mp)
+static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
/* all metadir file systems also allow rtgroups */
return xfs_has_metadir(mp);
}
-static inline bool xfs_has_rtsb(struct xfs_mount *mp)
+static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
}
+static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
+{
+ return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) &&
+ xfs_has_rmapbt(mp);
+}
+
+static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
+{
+ return xfs_has_metadir(mp) && xfs_has_realtime(mp) &&
+ xfs_has_reflink(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fa50e5308292..ed8d8ed42f0a 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -19,6 +19,9 @@
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
+#include "xfs_notify_failure.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
#include <linux/mm.h>
#include <linux/dax.h>
@@ -154,23 +157,115 @@ xfs_dax_notify_failure_thaw(
}
static int
-xfs_dax_notify_ddev_failure(
+xfs_dax_translate_range(
+ struct xfs_buftarg *btp,
+ u64 offset,
+ u64 len,
+ xfs_daddr_t *daddr,
+ uint64_t *bblen)
+{
+ u64 dev_start = btp->bt_dax_part_off;
+ u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_end = dev_start + dev_len - 1;
+
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = dev_start;
+ len = dev_len;
+ }
+
+ /* Ignore the range out of filesystem area */
+ if (offset + len - 1 < dev_start)
+ return -ENXIO;
+ if (offset > dev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > dev_start)
+ offset -= dev_start;
+ else {
+ len -= dev_start - offset;
+ offset = 0;
+ }
+ if (offset + len - 1 > dev_end)
+ len = dev_end - offset + 1;
+
+ *daddr = BTOBB(offset);
+ *bblen = BTOBB(len);
+ return 0;
+}
+
+static int
+xfs_dax_notify_logdev_failure(
struct xfs_mount *mp,
- xfs_daddr_t daddr,
- xfs_daddr_t bblen,
+ u64 offset,
+ u64 len,
int mf_flags)
{
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ int error;
+
+ /*
+ * Return ENXIO instead of shutting down the filesystem if the failed
+ * region is beyond the end of the log.
+ */
+ error = xfs_dax_translate_range(mp->m_logdev_targp,
+ offset, len, &daddr, &bblen);
+ if (error)
+ return error;
+
+ /*
+ * In the pre-remove case the failure notification is attempting to
+ * trigger a force unmount. The expectation is that the device is
+ * still present, but its removal is in progress and can not be
+ * cancelled, proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
+
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+}
+
+static int
+xfs_dax_notify_dev_failure(
+ struct xfs_mount *mp,
+ u64 offset,
+ u64 len,
+ int mf_flags,
+ enum xfs_group_type type)
+{
struct xfs_failure_info notify = { .mf_flags = mf_flags };
struct xfs_trans *tp = NULL;
struct xfs_btree_cur *cur = NULL;
- struct xfs_buf *agf_bp = NULL;
int error = 0;
bool kernel_frozen = false;
- xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
- xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp,
- daddr + bblen - 1);
- xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+ uint32_t start_gno, end_gno;
+ xfs_fsblock_t start_bno, end_bno;
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ struct xfs_group *xg = NULL;
+
+ if (!xfs_has_rmapbt(mp)) {
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
+ mp->m_rtdev_targp : mp->m_ddev_targp,
+ offset, len, &daddr, &bblen);
+ if (error)
+ return error;
+
+ if (type == XG_TYPE_RTG) {
+ start_bno = xfs_daddr_to_rtb(mp, daddr);
+ end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+ } else {
+ start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+ end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+ }
if (mf_flags & MF_MEM_PRE_REMOVE) {
xfs_info(mp, "Device is about to be removed!");
@@ -189,46 +284,58 @@ xfs_dax_notify_ddev_failure(
if (error)
goto out;
- for (; agno <= end_agno; agno++) {
+ start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+ end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+ while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_rtgroup *rtg = NULL;
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
- struct xfs_agf *agf;
- struct xfs_perag *pag;
- xfs_agblock_t range_agend;
- pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
- if (error) {
- xfs_perag_put(pag);
- break;
- }
+ if (type == XG_TYPE_AG) {
+ struct xfs_perag *pag = to_perag(xg);
- cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+ } else {
+ rtg = to_rtg(xg);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ }
/*
* Set the rmap range from ri_low to ri_high, which represents
* a [start, end] where we looking for the files or metadata.
*/
memset(&ri_high, 0xFF, sizeof(ri_high));
- ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
- if (agno == end_agno)
- ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+ if (xg->xg_gno == start_gno)
+ ri_low.rm_startblock =
+ xfs_fsb_to_gbno(mp, start_bno, type);
+ if (xg->xg_gno == end_gno)
+ ri_high.rm_startblock =
+ xfs_fsb_to_gbno(mp, end_bno, type);
- agf = agf_bp->b_addr;
- range_agend = min(be32_to_cpu(agf->agf_length) - 1,
- ri_high.rm_startblock);
notify.startblock = ri_low.rm_startblock;
- notify.blockcount = range_agend + 1 - ri_low.rm_startblock;
+ notify.blockcount = min(xg->xg_block_count,
+ ri_high.rm_startblock + 1) -
+ ri_low.rm_startblock;
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
xfs_dax_failure_fn, &notify);
xfs_btree_del_cursor(cur, error);
- xfs_trans_brelse(tp, agf_bp);
- xfs_perag_put(pag);
- if (error)
+ if (agf_bp)
+ xfs_trans_brelse(tp, agf_bp);
+ if (rtg)
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ if (error) {
+ xfs_group_put(xg);
break;
-
- fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+ }
}
xfs_trans_cancel(tp);
@@ -263,67 +370,20 @@ xfs_dax_notify_failure(
int mf_flags)
{
struct xfs_mount *mp = dax_holder(dax_dev);
- u64 ddev_start;
- u64 ddev_end;
if (!(mp->m_super->s_flags & SB_BORN)) {
xfs_warn(mp, "filesystem is not ready for notify_failure()!");
return -EIO;
}
- if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
- xfs_debug(mp,
- "notify_failure() not supported on realtime device!");
- return -EOPNOTSUPP;
- }
-
- if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
- mp->m_logdev_targp != mp->m_ddev_targp) {
- /*
- * In the pre-remove case the failure notification is attempting
- * to trigger a force unmount. The expectation is that the
- * device is still present, but its removal is in progress and
- * can not be cancelled, proceed with accessing the log device.
- */
- if (mf_flags & MF_MEM_PRE_REMOVE)
- return 0;
- xfs_err(mp, "ondisk log corrupt, shutting down fs!");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
- return -EFSCORRUPTED;
- }
-
- if (!xfs_has_rmapbt(mp)) {
- xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
- return -EOPNOTSUPP;
- }
-
- ddev_start = mp->m_ddev_targp->bt_dax_part_off;
- ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
-
- /* Notify failure on the whole device. */
- if (offset == 0 && len == U64_MAX) {
- offset = ddev_start;
- len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
- }
-
- /* Ignore the range out of filesystem area */
- if (offset + len - 1 < ddev_start)
- return -ENXIO;
- if (offset > ddev_end)
- return -ENXIO;
-
- /* Calculate the real range when it touches the boundary */
- if (offset > ddev_start)
- offset -= ddev_start;
- else {
- len -= ddev_start - offset;
- offset = 0;
+ if (mp->m_logdev_targp != mp->m_ddev_targp &&
+ mp->m_logdev_targp->bt_daxdev == dax_dev) {
+ return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
}
- if (offset + len - 1 > ddev_end)
- len = ddev_end - offset + 1;
- return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
- mf_flags);
+ return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
+ (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
+ XG_TYPE_RTG : XG_TYPE_AG);
}
const struct dax_holder_operations xfs_dax_holder_operations = {
diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
new file mode 100644
index 000000000000..8d08ec29dd29
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_NOTIFY_FAILURE_H__
+#define __XFS_NOTIFY_FAILURE_H__
+
+extern const struct dax_holder_operations xfs_dax_holder_operations;
+
+#endif /* __XFS_NOTIFY_FAILURE_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index dc8b1010d4d3..e1ba5af6250f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -230,10 +230,10 @@ xfs_qm_unmount_rt(
if (!rtg)
return;
- if (rtg->rtg_inodes[XFS_RTGI_BITMAP])
- xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_BITMAP]);
- if (rtg->rtg_inodes[XFS_RTGI_SUMMARY])
- xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_SUMMARY]);
+ if (rtg_bitmap(rtg))
+ xfs_qm_dqdetach(rtg_bitmap(rtg));
+ if (rtg_summary(rtg))
+ xfs_qm_dqdetach(rtg_summary(rtg));
xfs_rtgroup_rele(rtg);
}
@@ -428,6 +428,8 @@ void
xfs_qm_dqdetach(
xfs_inode_t *ip)
{
+ if (xfs_is_metadir_inode(ip))
+ return;
if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
return;
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 847ba29630e9..245d754f382a 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -32,21 +32,27 @@ xfs_fill_statvfs_from_dquot(
limit = blkres->softlimit ?
blkres->softlimit :
blkres->hardlimit;
- if (limit && statp->f_blocks > limit) {
- statp->f_blocks = limit;
- statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > blkres->reserved) ?
- (statp->f_blocks - blkres->reserved) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > blkres->reserved)
+ remaining = limit - blkres->reserved;
+
+ statp->f_blocks = min(statp->f_blocks, limit);
+ statp->f_bfree = min(statp->f_bfree, remaining);
}
limit = dqp->q_ino.softlimit ?
dqp->q_ino.softlimit :
dqp->q_ino.hardlimit;
- if (limit && statp->f_files > limit) {
- statp->f_files = limit;
- statp->f_ffree =
- (statp->f_files > dqp->q_ino.reserved) ?
- (statp->f_files - dqp->q_ino.reserved) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dqp->q_ino.reserved)
+ remaining = limit - dqp->q_ino.reserved;
+
+ statp->f_files = min(statp->f_files, limit);
+ statp->f_ffree = min(statp->f_ffree, remaining);
}
}
@@ -72,6 +78,28 @@ xfs_qm_statvfs(
}
}
+STATIC int
+xfs_qm_validate_state_change(
+ struct xfs_mount *mp,
+ uint uqd,
+ uint gqd,
+ uint pqd)
+{
+ int state;
+
+ /* Is quota state changing? */
+ state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) ||
+ (!uqd && XFS_IS_UQUOTA_ON(mp)) ||
+ (gqd && !XFS_IS_GQUOTA_ON(mp)) ||
+ (!gqd && XFS_IS_GQUOTA_ON(mp)) ||
+ (pqd && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pqd && XFS_IS_PQUOTA_ON(mp)));
+
+ return state &&
+ (xfs_dev_is_read_only(mp, "changing quota state") ||
+ xfs_has_norecovery(mp));
+}
+
int
xfs_qm_newmount(
xfs_mount_t *mp,
@@ -91,24 +119,25 @@ xfs_qm_newmount(
}
/*
- * If the device itself is read-only, we can't allow
- * the user to change the state of quota on the mount -
- * this would generate a transaction on the ro device,
- * which would lead to an I/O error and shutdown
+ * If the device itself is read-only and/or in norecovery
+ * mode, we can't allow the user to change the state of
+ * quota on the mount - this would generate a transaction
+ * on the ro device, which would lead to an I/O error and
+ * shutdown.
*/
- if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
- (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
- xfs_dev_is_read_only(mp, "changing quota state")) {
- xfs_warn(mp, "please mount with%s%s%s%s.",
- (!quotaondisk ? "out quota" : ""),
- (uquotaondisk ? " usrquota" : ""),
- (gquotaondisk ? " grpquota" : ""),
- (pquotaondisk ? " prjquota" : ""));
+ if (xfs_qm_validate_state_change(mp, uquotaondisk,
+ gquotaondisk, pquotaondisk)) {
+
+ if (xfs_has_metadir(mp))
+ xfs_warn(mp,
+ "metadir enabled, please mount without any quota mount options");
+ else
+ xfs_warn(mp, "please mount with%s%s%s%s.",
+ (!quotaondisk ? "out quota" : ""),
+ (uquotaondisk ? " usrquota" : ""),
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return -EPERM;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index d7565462af3d..105e6eb57620 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -29,11 +29,6 @@ struct xfs_buf;
(XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
(XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
-#define XFS_IS_DQDETACHED(ip) \
- ((ip)->i_udquot == NULL && \
- (ip)->i_gdquot == NULL && \
- (ip)->i_pdquot == NULL)
-
#define XFS_QM_NEED_QUOTACHECK(mp) \
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index bede1c96c330..fe2d7aab8554 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -23,6 +23,7 @@
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
struct kmem_cache *xfs_cui_cache;
struct kmem_cache *xfs_cud_cache;
@@ -94,8 +95,9 @@ xfs_cui_item_format(
ASSERT(atomic_read(&cuip->cui_next_extent) ==
cuip->cui_format.cui_nextents);
+ ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
- cuip->cui_format.cui_type = XFS_LI_CUI;
+ cuip->cui_format.cui_type = lip->li_type;
cuip->cui_format.cui_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
@@ -138,12 +140,14 @@ xfs_cui_item_release(
STATIC struct xfs_cui_log_item *
xfs_cui_init(
struct xfs_mount *mp,
+ unsigned short item_type,
uint nextents)
-
{
struct xfs_cui_log_item *cuip;
ASSERT(nextents > 0);
+ ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT);
+
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
GFP_KERNEL | __GFP_NOFAIL);
@@ -151,7 +155,7 @@ xfs_cui_init(
cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+ xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
atomic_set(&cuip->cui_next_extent, 0);
@@ -190,7 +194,9 @@ xfs_cud_item_format(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
struct xfs_log_iovec *vecp = NULL;
- cudp->cud_format.cud_type = XFS_LI_CUD;
+ ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT);
+
+ cudp->cud_format.cud_type = lip->li_type;
cudp->cud_format.cud_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
@@ -234,6 +240,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
return list_entry(e, struct xfs_refcount_intent, ri_list);
}
+static inline bool
+xfs_cui_item_isrt(const struct xfs_log_item *lip)
+{
+ ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
+
+ return lip->li_type == XFS_LI_CUI_RT;
+}
+
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -282,18 +296,20 @@ xfs_refcount_update_log_item(
}
static struct xfs_log_item *
-xfs_refcount_update_create_intent(
+__xfs_refcount_update_create_intent(
struct xfs_trans *tp,
struct list_head *items,
unsigned int count,
- bool sort)
+ bool sort,
+ unsigned short item_type)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_cui_log_item *cuip;
struct xfs_refcount_intent *ri;
ASSERT(count > 0);
+ cuip = xfs_cui_init(mp, item_type, count);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -301,6 +317,23 @@ xfs_refcount_update_create_intent(
return &cuip->cui_item;
}
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_refcount_update_create_intent(tp, items, count, sort,
+ XFS_LI_CUI);
+}
+
+static inline unsigned short
+xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip)
+{
+ return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD;
+}
+
/* Get an CUD so we can process all the deferred refcount updates. */
static struct xfs_log_item *
xfs_refcount_update_create_done(
@@ -312,8 +345,8 @@ xfs_refcount_update_create_done(
struct xfs_cud_log_item *cudp;
cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
- &xfs_cud_item_ops);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item,
+ xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops);
cudp->cud_cuip = cuip;
cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
@@ -328,10 +361,20 @@ xfs_refcount_defer_add(
{
struct xfs_mount *mp = tp->t_mountp;
- trace_xfs_refcount_defer(mp, ri);
+ /*
+ * Deferred refcount updates for the realtime and data sections must
+ * use separate transactions to finish deferred work because updates to
+ * realtime metadata files can lock AGFs to allocate btree blocks and
+ * we don't want that mixing with the AGF locks taken to finish data
+ * section updates.
+ */
+ ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock,
+ ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG);
- ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG);
- xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+ trace_xfs_refcount_defer(mp, ri);
+ xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ?
+ &xfs_rtrefcount_update_defer_type :
+ &xfs_refcount_update_defer_type);
}
/* Cancel a deferred refcount update. */
@@ -381,7 +424,7 @@ xfs_refcount_finish_one_cleanup(
return;
agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
- if (error)
+ if (error && agbp)
xfs_trans_brelse(tp, agbp);
}
@@ -397,6 +440,7 @@ xfs_refcount_update_abort_intent(
static inline bool
xfs_cui_validate_phys(
struct xfs_mount *mp,
+ bool isrt,
struct xfs_phys_extent *pmap)
{
if (!xfs_has_reflink(mp))
@@ -415,6 +459,9 @@ xfs_cui_validate_phys(
return false;
}
+ if (isrt)
+ return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len);
+
return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
@@ -422,6 +469,7 @@ static inline void
xfs_cui_recover_work(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp,
+ bool isrt,
struct xfs_phys_extent *pmap)
{
struct xfs_refcount_intent *ri;
@@ -432,7 +480,8 @@ xfs_cui_recover_work(
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock,
- XG_TYPE_AG);
+ isrt ? XG_TYPE_RTG : XG_TYPE_AG);
+ ri->ri_realtime = isrt;
xfs_defer_add_item(dfp, &ri->ri_list);
}
@@ -451,6 +500,7 @@ xfs_refcount_recover_work(
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_mount *mp = lip->li_log->l_mp;
+ bool isrt = xfs_cui_item_isrt(lip);
int i;
int error = 0;
@@ -460,7 +510,7 @@ xfs_refcount_recover_work(
* just toss the CUI.
*/
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- if (!xfs_cui_validate_phys(mp,
+ if (!xfs_cui_validate_phys(mp, isrt,
&cuip->cui_format.cui_extents[i])) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&cuip->cui_format,
@@ -468,7 +518,8 @@ xfs_refcount_recover_work(
return -EFSCORRUPTED;
}
- xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
+ xfs_cui_recover_work(mp, dfp, isrt,
+ &cuip->cui_format.cui_extents[i]);
}
/*
@@ -515,10 +566,13 @@ xfs_refcount_relog_intent(
struct xfs_phys_extent *pmap;
unsigned int count;
+ ASSERT(intent->li_type == XFS_LI_CUI ||
+ intent->li_type == XFS_LI_CUI_RT);
+
count = CUI_ITEM(intent)->cui_format.cui_nextents;
pmap = CUI_ITEM(intent)->cui_format.cui_extents;
- cuip = xfs_cui_init(tp->t_mountp, count);
+ cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count);
memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
@@ -538,6 +592,71 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.relog_intent = xfs_refcount_relog_intent,
};
+#ifdef CONFIG_XFS_RT
+static struct xfs_log_item *
+xfs_rtrefcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_refcount_update_create_intent(tp, items, count, sort,
+ XFS_LI_CUI_RT);
+}
+
+/* Process a deferred realtime refcount update. */
+STATIC int
+xfs_rtrefcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_refcount_intent *ri = ci_entry(item);
+ int error;
+
+ error = xfs_rtrefcount_finish_one(tp, ri, state);
+
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && ri->ri_blockcount > 0) {
+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+ ri->ri_type == XFS_REFCOUNT_DECREASE);
+ return -EAGAIN;
+ }
+
+ xfs_refcount_update_cancel_item(item);
+ return error;
+}
+
+/* Clean up after calling xfs_rtrefcount_finish_one. */
+STATIC void
+xfs_rtrefcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ if (rcur)
+ xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+ .name = "rtrefcount",
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rtrefcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_rtrefcount_update_finish_item,
+ .finish_cleanup = xfs_rtrefcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+ .recover_work = xfs_refcount_recover_work,
+ .relog_intent = xfs_refcount_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+ .name = "rtrefcount",
+};
+#endif /* CONFIG_XFS_RT */
+
STATIC bool
xfs_cui_item_match(
struct xfs_log_item *lip,
@@ -603,7 +722,7 @@ xlog_recover_cui_commit_pass2(
return -EFSCORRUPTED;
}
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
@@ -617,6 +736,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = {
.commit_pass2 = xlog_recover_cui_commit_pass2,
};
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+ size_t len;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+ xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+ &xfs_rtrefcount_update_defer_type);
+ return 0;
+}
+#else
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcui_item_ops = {
+ .item_type = XFS_LI_CUI_RT,
+ .commit_pass2 = xlog_recover_rtcui_commit_pass2,
+};
+
/*
* This routine is called when an CUD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding CUI if it
@@ -648,3 +822,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = {
.item_type = XFS_LI_CUD,
.commit_pass2 = xlog_recover_cud_commit_pass2,
};
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_format *cud_formatp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_CUI_RT,
+ cud_formatp->cud_cui_id);
+ return 0;
+}
+#else
+# define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcud_item_ops = {
+ .item_type = XFS_LI_CUD_RT,
+ .commit_pass2 = xlog_recover_rtcud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b11769c009ef..59f7fc16eb80 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -30,6 +30,10 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_health.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
/*
* Copy on Write of Shared Blocks
@@ -120,38 +124,93 @@
*/
/*
- * Given an AG extent, find the lowest-numbered run of shared blocks
- * within that range and return the range in fbno/flen. If
- * find_end_of_shared is true, return the longest contiguous extent of
- * shared blocks. If there are no shared extents, fbno and flen will
- * be set to NULLAGBLOCK and 0, respectively.
+ * Given a file mapping for the data device, find the lowest-numbered run of
+ * shared blocks within that mapping and return it in shared_offset/shared_len.
+ * The offset is relative to the start of irec.
+ *
+ * If find_end_of_shared is true, return the longest contiguous extent of shared
+ * blocks. If there are no shared extents, shared_offset and shared_len will be
+ * set to 0;
*/
static int
xfs_reflink_find_shared(
- struct xfs_perag *pag,
+ struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- xfs_agblock_t *fbno,
- xfs_extlen_t *flen,
+ const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t *shared_offset,
+ xfs_extlen_t *shared_len,
bool find_end_of_shared)
{
struct xfs_buf *agbp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *cur;
int error;
+ xfs_agblock_t orig_bno, found_bno;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
- return error;
+ goto out;
- cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount,
+ &found_bno, shared_len, find_end_of_shared);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
- error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
- find_end_of_shared);
+ if (!error && *shared_len)
+ *shared_offset = found_bno - orig_bno;
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+/*
+ * Given a file mapping for the rt device, find the lowest-numbered run of
+ * shared blocks within that mapping and return it in shared_offset/shared_len.
+ * The offset is relative to the start of irec.
+ *
+ * If find_end_of_shared is true, return the longest contiguous extent of shared
+ * blocks. If there are no shared extents, shared_offset and shared_len will be
+ * set to 0;
+ */
+static int
+xfs_reflink_find_rtshared(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t *shared_offset,
+ xfs_extlen_t *shared_len,
+ bool find_end_of_shared)
+{
+ struct xfs_rtgroup *rtg;
+ struct xfs_btree_cur *cur;
+ xfs_rgblock_t orig_bno;
+ xfs_agblock_t found_bno;
+ int error;
+
+ BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
+
+ /*
+ * Note: this uses the not quite correct xfs_agblock_t type because
+ * xfs_refcount_find_shared is shared between the RT and data device
+ * refcount code.
+ */
+ orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock);
+ rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock));
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT);
+ cur = xfs_rtrefcountbt_init_cursor(tp, rtg);
+ error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount,
+ &found_bno, shared_len, find_end_of_shared);
xfs_btree_del_cursor(cur, error);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
+ xfs_rtgroup_put(rtg);
- xfs_trans_brelse(tp, agbp);
+ if (!error && *shared_len)
+ *shared_offset = found_bno - orig_bno;
return error;
}
@@ -172,11 +231,7 @@ xfs_reflink_trim_around_shared(
bool *shared)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_extlen_t shared_offset, shared_len;
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
@@ -187,41 +242,37 @@ xfs_reflink_trim_around_shared(
trace_xfs_reflink_trim_around_shared(ip, irec);
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
- aglen = irec->br_blockcount;
-
- error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
- true);
- xfs_perag_put(pag);
+ if (XFS_IS_REALTIME_INODE(ip))
+ error = xfs_reflink_find_rtshared(mp, NULL, irec,
+ &shared_offset, &shared_len, true);
+ else
+ error = xfs_reflink_find_shared(mp, NULL, irec,
+ &shared_offset, &shared_len, true);
if (error)
return error;
- *shared = false;
- if (fbno == NULLAGBLOCK) {
+ if (!shared_len) {
/* No shared blocks at all. */
- return 0;
- }
-
- if (fbno == agbno) {
+ *shared = false;
+ } else if (!shared_offset) {
/*
- * The start of this extent is shared. Truncate the
- * mapping at the end of the shared region so that a
- * subsequent iteration starts at the start of the
- * unshared region.
+ * The start of this mapping points to shared space. Truncate
+ * the mapping at the end of the shared region so that a
+ * subsequent iteration starts at the start of the unshared
+ * region.
*/
- irec->br_blockcount = flen;
+ irec->br_blockcount = shared_len;
*shared = true;
- return 0;
+ } else {
+ /*
+ * There's a shared region that doesn't start at the beginning
+ * of the mapping. Truncate the mapping at the start of the
+ * shared extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = shared_offset;
+ *shared = false;
}
-
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
return 0;
}
@@ -389,20 +440,26 @@ xfs_reflink_fill_cow_hole(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks;
+ unsigned int dblocks = 0, rblocks = 0;
int nimaps;
int error;
bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
+ }
xfs_iunlock(ip, *lockmode);
*lockmode = 0;
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
- false, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, false, &tp);
if (error)
return error;
@@ -571,6 +628,7 @@ xfs_reflink_cancel_cow_blocks(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int error = 0;
if (!xfs_inode_has_cow_data(ip))
@@ -598,12 +656,13 @@ xfs_reflink_cancel_cow_blocks(
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
/* Free the CoW orphan record. */
- xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
- del.br_blockcount);
+ xfs_refcount_free_cow_extent(*tpp, isrt,
+ del.br_startblock, del.br_blockcount);
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ XFS_AG_RESV_NONE,
+ isrt ? XFS_FREE_EXTENT_REALTIME : 0);
if (error)
break;
@@ -687,6 +746,35 @@ out:
return error;
}
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Update quota accounting for a remapping operation. When we're remapping
+ * something from the CoW fork to the data fork, we must update the quota
+ * accounting for delayed allocations. For remapping from the data fork to the
+ * data fork, use regular block accounting.
+ */
+static inline void
+xfs_reflink_update_quota(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ bool is_cow,
+ int64_t blocks)
+{
+ unsigned int qflag;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT :
+ XFS_TRANS_DQ_RTBCOUNT;
+ } else {
+ qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT :
+ XFS_TRANS_DQ_BCOUNT;
+ }
+ xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks);
+}
+#else
+# define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0)
+#endif
+
/*
* Remap part of the CoW fork into the data fork.
*
@@ -710,6 +798,7 @@ xfs_reflink_end_cow_extent(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
int nmaps;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
@@ -779,9 +868,8 @@ xfs_reflink_end_cow_extent(
* or not), unmap the extent and drop its refcount.
*/
xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
- xfs_refcount_decrease_extent(tp, &data);
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- -data.br_blockcount);
+ xfs_refcount_decrease_extent(tp, isrt, &data);
+ xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
} else if (data.br_startblock == DELAYSTARTBLOCK) {
int done;
@@ -799,14 +887,14 @@ xfs_reflink_end_cow_extent(
}
/* Free the CoW orphan record. */
- xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
+ xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
+ del.br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
/* Charge this new data fork mapping to the on-disk quota. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
- (long)del.br_blockcount);
+ xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
@@ -895,20 +983,29 @@ xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
int error = 0;
if (!xfs_has_reflink(mp))
return 0;
while ((pag = xfs_perag_next(mp, pag))) {
- error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ error = xfs_refcount_recover_cow_leftovers(pag_group(pag));
if (error) {
xfs_perag_rele(pag);
- break;
+ return error;
}
}
- return error;
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg));
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ return 0;
}
/*
@@ -1100,14 +1197,28 @@ out_error:
static int
xfs_reflink_ag_has_free_space(
struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_inode *ip,
+ xfs_fsblock_t fsb)
{
struct xfs_perag *pag;
+ xfs_agnumber_t agno;
int error = 0;
if (!xfs_has_rmapbt(mp))
return 0;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ struct xfs_rtgroup *rtg;
+ xfs_rgnumber_t rgno;
+
+ rgno = xfs_rtb_to_rgno(mp, fsb);
+ rtg = xfs_rtgroup_get(mp, rgno);
+ if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
+ error = -ENOSPC;
+ xfs_rtgroup_put(rtg);
+ return error;
+ }
+ agno = XFS_FSB_TO_AGNO(mp, fsb);
pag = xfs_perag_get(mp, agno);
if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
@@ -1131,10 +1242,11 @@ xfs_reflink_remap_extent(
struct xfs_trans *tp;
xfs_off_t newlen;
int64_t qdelta = 0;
- unsigned int resblks;
+ unsigned int dblocks, rblocks, resblks;
bool quota_reserved = true;
bool smap_real;
bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int iext_delta = 0;
int nimaps;
int error;
@@ -1161,8 +1273,15 @@ xfs_reflink_remap_extent(
* we're remapping.
*/
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = resblks;
+ rblocks = dmap->br_blockcount;
+ } else {
+ dblocks = resblks + dmap->br_blockcount;
+ rblocks = 0;
+ }
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
- resblks + dmap->br_blockcount, 0, false, &tp);
+ dblocks, rblocks, false, &tp);
if (error == -EDQUOT || error == -ENOSPC) {
quota_reserved = false;
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
@@ -1213,8 +1332,8 @@ xfs_reflink_remap_extent(
/* No reflinking if the AG of the dest mapping is low on space. */
if (dmap_written) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
+ error = xfs_reflink_ag_has_free_space(mp, ip,
+ dmap->br_startblock);
if (error)
goto out_cancel;
}
@@ -1242,8 +1361,15 @@ xfs_reflink_remap_extent(
* done.
*/
if (!quota_reserved && !smap_real && dmap_written) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- dmap->br_blockcount, 0, false);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = 0;
+ rblocks = dmap->br_blockcount;
+ } else {
+ dblocks = dmap->br_blockcount;
+ rblocks = 0;
+ }
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+ false);
if (error)
goto out_cancel;
}
@@ -1264,7 +1390,7 @@ xfs_reflink_remap_extent(
* or not), unmap the extent and drop its refcount.
*/
xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
- xfs_refcount_decrease_extent(tp, &smap);
+ xfs_refcount_decrease_extent(tp, isrt, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
int done;
@@ -1287,12 +1413,12 @@ xfs_reflink_remap_extent(
* its refcount and map it into the file.
*/
if (dmap_written) {
- xfs_refcount_increase_extent(tp, dmap);
+ xfs_refcount_increase_extent(tp, isrt, dmap);
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
qdelta += dmap->br_blockcount;
}
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+ xfs_reflink_update_quota(tp, ip, false, qdelta);
/* Update dest isize if needed. */
newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
@@ -1466,8 +1592,8 @@ xfs_reflink_remap_prep(
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
- /* Don't reflink realtime inodes */
- if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+ /* Can't reflink between data and rt volumes */
+ if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
/* Don't share DAX file data with non-DAX file. */
@@ -1547,27 +1673,23 @@ xfs_reflink_inode_has_shared_extents(
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
- struct xfs_perag *pag;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
+ xfs_extlen_t shared_offset, shared_len;
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
- agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
- aglen = got.br_blockcount;
- error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
- &rbno, &rlen, false);
- xfs_perag_put(pag);
+ if (XFS_IS_REALTIME_INODE(ip))
+ error = xfs_reflink_find_rtshared(mp, tp, &got,
+ &shared_offset, &shared_len, false);
+ else
+ error = xfs_reflink_find_shared(mp, tp, &got,
+ &shared_offset, &shared_len, false);
if (error)
return error;
/* Is there still a shared block here? */
- if (rbno != NULLAGBLOCK) {
+ if (shared_len) {
*has_shared = true;
return 0;
}
@@ -1700,3 +1822,28 @@ out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
}
+
+/*
+ * Can we use reflink with this realtime extent size? Note that we don't check
+ * for rblocks > 0 here because this can be called as part of attaching a new
+ * rt section.
+ */
+bool
+xfs_reflink_supports_rextsize(
+ struct xfs_mount *mp,
+ unsigned int rextsize)
+{
+ /* reflink on the realtime device requires rtgroups */
+ if (!xfs_has_rtgroups(mp))
+ return false;
+
+ /*
+ * Reflink doesn't support rt extent size larger than a single fsblock
+ * because we would have to perform CoW-around for unaligned write
+ * requests to guarantee that we always remap entire rt extents.
+ */
+ if (rextsize != 1)
+ return false;
+
+ return true;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 4a58e4533671..cc4e92278279 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -25,7 +25,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
return true;
}
-extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool *shared);
@@ -62,4 +62,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
+bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
+
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 76b3c0ed3b4f..89decffe76c8 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -23,6 +23,7 @@
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
struct kmem_cache *xfs_rui_cache;
struct kmem_cache *xfs_rud_cache;
@@ -94,7 +95,9 @@ xfs_rui_item_format(
ASSERT(atomic_read(&ruip->rui_next_extent) ==
ruip->rui_format.rui_nextents);
- ruip->rui_format.rui_type = XFS_LI_RUI;
+ ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT);
+
+ ruip->rui_format.rui_type = lip->li_type;
ruip->rui_format.rui_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
@@ -137,12 +140,15 @@ xfs_rui_item_release(
STATIC struct xfs_rui_log_item *
xfs_rui_init(
struct xfs_mount *mp,
+ unsigned short item_type,
uint nextents)
{
struct xfs_rui_log_item *ruip;
ASSERT(nextents > 0);
+ ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT);
+
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kzalloc(xfs_rui_log_item_sizeof(nextents),
GFP_KERNEL | __GFP_NOFAIL);
@@ -150,7 +156,7 @@ xfs_rui_init(
ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+ xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
atomic_set(&ruip->rui_next_extent, 0);
@@ -189,7 +195,9 @@ xfs_rud_item_format(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
struct xfs_log_iovec *vecp = NULL;
- rudp->rud_format.rud_type = XFS_LI_RUD;
+ ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT);
+
+ rudp->rud_format.rud_type = lip->li_type;
rudp->rud_format.rud_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format,
@@ -233,6 +241,14 @@ static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e)
return list_entry(e, struct xfs_rmap_intent, ri_list);
}
+static inline bool
+xfs_rui_item_isrt(const struct xfs_log_item *lip)
+{
+ ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT);
+
+ return lip->li_type == XFS_LI_RUI_RT;
+}
+
/* Sort rmap intents by AG. */
static int
xfs_rmap_update_diff_items(
@@ -305,18 +321,20 @@ xfs_rmap_update_log_item(
}
static struct xfs_log_item *
-xfs_rmap_update_create_intent(
+__xfs_rmap_update_create_intent(
struct xfs_trans *tp,
struct list_head *items,
unsigned int count,
- bool sort)
+ bool sort,
+ unsigned short item_type)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+ struct xfs_rui_log_item *ruip;
struct xfs_rmap_intent *ri;
ASSERT(count > 0);
+ ruip = xfs_rui_init(mp, item_type, count);
if (sort)
list_sort(mp, items, xfs_rmap_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -324,6 +342,23 @@ xfs_rmap_update_create_intent(
return &ruip->rui_item;
}
+static struct xfs_log_item *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_rmap_update_create_intent(tp, items, count, sort,
+ XFS_LI_RUI);
+}
+
+static inline unsigned short
+xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip)
+{
+ return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD;
+}
+
/* Get an RUD so we can process all the deferred rmap updates. */
static struct xfs_log_item *
xfs_rmap_update_create_done(
@@ -335,8 +370,8 @@ xfs_rmap_update_create_done(
struct xfs_rud_log_item *rudp;
rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
- &xfs_rud_item_ops);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item,
+ xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops);
rudp->rud_ruip = ruip;
rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
@@ -351,11 +386,20 @@ xfs_rmap_defer_add(
{
struct xfs_mount *mp = tp->t_mountp;
- trace_xfs_rmap_defer(mp, ri);
-
+ /*
+ * Deferred rmap updates for the realtime and data sections must use
+ * separate transactions to finish deferred work because updates to
+ * realtime metadata files can lock AGFs to allocate btree blocks and
+ * we don't want that mixing with the AGF locks taken to finish data
+ * section updates.
+ */
ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock,
- XG_TYPE_AG);
- xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+ ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG);
+
+ trace_xfs_rmap_defer(mp, ri);
+ xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ?
+ &xfs_rtrmap_update_defer_type :
+ &xfs_rmap_update_defer_type);
}
/* Cancel a deferred rmap update. */
@@ -415,6 +459,7 @@ xfs_rmap_update_abort_intent(
static inline bool
xfs_rui_validate_map(
struct xfs_mount *mp,
+ bool isrt,
struct xfs_map_extent *map)
{
if (!xfs_has_rmapbt(mp))
@@ -444,6 +489,9 @@ xfs_rui_validate_map(
if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
+ if (isrt)
+ return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
@@ -451,6 +499,7 @@ static inline void
xfs_rui_recover_work(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp,
+ bool isrt,
const struct xfs_map_extent *map)
{
struct xfs_rmap_intent *ri;
@@ -495,7 +544,9 @@ xfs_rui_recover_work(
ri->ri_bmap.br_blockcount = map->me_len;
ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, XG_TYPE_AG);
+ ri->ri_group = xfs_group_intent_get(mp, map->me_startblock,
+ isrt ? XG_TYPE_RTG : XG_TYPE_AG);
+ ri->ri_realtime = isrt;
xfs_defer_add_item(dfp, &ri->ri_list);
}
@@ -514,6 +565,7 @@ xfs_rmap_recover_work(
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_mount *mp = lip->li_log->l_mp;
+ bool isrt = xfs_rui_item_isrt(lip);
int i;
int error = 0;
@@ -523,7 +575,7 @@ xfs_rmap_recover_work(
* just toss the RUI.
*/
for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- if (!xfs_rui_validate_map(mp,
+ if (!xfs_rui_validate_map(mp, isrt,
&ruip->rui_format.rui_extents[i])) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&ruip->rui_format,
@@ -531,7 +583,8 @@ xfs_rmap_recover_work(
return -EFSCORRUPTED;
}
- xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]);
+ xfs_rui_recover_work(mp, dfp, isrt,
+ &ruip->rui_format.rui_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -566,10 +619,13 @@ xfs_rmap_relog_intent(
struct xfs_map_extent *map;
unsigned int count;
+ ASSERT(intent->li_type == XFS_LI_RUI ||
+ intent->li_type == XFS_LI_RUI_RT);
+
count = RUI_ITEM(intent)->rui_format.rui_nextents;
map = RUI_ITEM(intent)->rui_format.rui_extents;
- ruip = xfs_rui_init(tp->t_mountp, count);
+ ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count);
memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
atomic_set(&ruip->rui_next_extent, count);
@@ -589,6 +645,47 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.relog_intent = xfs_rmap_relog_intent,
};
+#ifdef CONFIG_XFS_RT
+static struct xfs_log_item *
+xfs_rtrmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_rmap_update_create_intent(tp, items, count, sort,
+ XFS_LI_RUI_RT);
+}
+
+/* Clean up after calling xfs_rmap_finish_one. */
+STATIC void
+xfs_rtrmap_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ if (rcur)
+ xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
+ .name = "rtrmap",
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rtrmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rtrmap_finish_one_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+ .recover_work = xfs_rmap_recover_work,
+ .relog_intent = xfs_rmap_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
+ .name = "rtrmap",
+};
+#endif
+
STATIC bool
xfs_rui_item_match(
struct xfs_log_item *lip,
@@ -654,7 +751,7 @@ xlog_recover_rui_commit_pass2(
return -EFSCORRUPTED;
}
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents);
xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
@@ -668,6 +765,61 @@ const struct xlog_recover_item_ops xlog_rui_item_ops = {
.commit_pass2 = xlog_recover_rui_commit_pass2,
};
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtrui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_rui_log_format *rui_formatp;
+ size_t len;
+
+ rui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents);
+ xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
+ atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+
+ xlog_recover_intent_item(log, &ruip->rui_item, lsn,
+ &xfs_rtrmap_update_defer_type);
+ return 0;
+}
+#else
+STATIC int
+xlog_recover_rtrui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtrui_item_ops = {
+ .item_type = XFS_LI_RUI_RT,
+ .commit_pass2 = xlog_recover_rtrui_commit_pass2,
+};
+
/*
* This routine is called when an RUD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding RUI if it
@@ -699,3 +851,33 @@ const struct xlog_recover_item_ops xlog_rud_item_ops = {
.item_type = XFS_LI_RUD,
.commit_pass2 = xlog_recover_rud_commit_pass2,
};
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtrud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_rud_log_format *rud_formatp;
+
+ rud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ rud_formatp, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_RUI_RT,
+ rud_formatp->rud_rui_id);
+ return 0;
+}
+#else
+# define xlog_recover_rtrud_commit_pass2 xlog_recover_rtrui_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtrud_item_ops = {
+ .item_type = XFS_LI_RUD_RT,
+ .commit_pass2 = xlog_recover_rtrud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index fcfa6e0eb3ad..57bef567e011 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,6 +22,7 @@
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
#include "xfs_quota.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
@@ -30,6 +31,8 @@
#include "xfs_rtgroup.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_reflink.h"
/*
* Return whether there are any free extents in the size range given
@@ -592,7 +595,7 @@ xfs_rtalloc_sumlevel(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int
+static int
xfs_rtallocate_extent_size(
struct xfs_rtalloc_args *args,
xfs_rtxlen_t minlen, /* minimum length to allocate */
@@ -845,6 +848,13 @@ xfs_growfs_rt_init_rtsb(
mp->m_rtsb_bp = rtsb_bp;
error = xfs_bwrite(rtsb_bp);
xfs_buf_unlock(rtsb_bp);
+ if (error)
+ return error;
+
+ /* Initialize the rtrmap to reflect the rtsb. */
+ if (rtg_rmap(args->rtg) != NULL)
+ error = xfs_rtrmapbt_init_rtsb(nargs->mp, args->rtg, args->tp);
+
return error;
}
@@ -856,8 +866,8 @@ xfs_growfs_rt_bmblock(
xfs_fileoff_t bmbno)
{
struct xfs_mount *mp = rtg_mount(rtg);
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
- struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
+ struct xfs_inode *rsumip = rtg_summary(rtg);
struct xfs_rtalloc_args args = {
.mp = mp,
.rtg = rtg,
@@ -893,8 +903,9 @@ xfs_growfs_rt_bmblock(
goto out_free;
nargs.tp = args.tp;
- xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
- xfs_rtgroup_trans_join(args.tp, args.rtg, XFS_RTGLOCK_BITMAP);
+ xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(args.tp, args.rtg,
+ XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP);
/*
* Update the bitmap inode's size ondisk and incore. We need to update
@@ -980,9 +991,12 @@ xfs_growfs_rt_bmblock(
goto out_free;
/*
- * Ensure the mount RT feature flag is now set.
+ * Ensure the mount RT feature flag is now set, and compute new
+ * maxlevels for rt btrees.
*/
mp->m_features |= XFS_FEAT_REALTIME;
+ xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
kfree(nmp);
return 0;
@@ -1041,8 +1055,8 @@ xfs_growfs_rt_alloc_blocks(
xfs_extlen_t *nrbmblocks)
{
struct xfs_mount *mp = rtg_mount(rtg);
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
- struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
+ struct xfs_inode *rsumip = rtg_summary(rtg);
xfs_extlen_t orbmblocks = 0;
xfs_extlen_t orsumblocks = 0;
struct xfs_mount *nmp;
@@ -1150,29 +1164,38 @@ out_rele:
return error;
}
-static int
+int
xfs_growfs_check_rtgeom(
const struct xfs_mount *mp,
+ xfs_rfsblock_t dblocks,
xfs_rfsblock_t rblocks,
xfs_extlen_t rextsize)
{
+ xfs_extlen_t min_logfsbs;
struct xfs_mount *nmp;
- int error = 0;
nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize);
if (!nmp)
return -ENOMEM;
+ nmp->m_sb.sb_dblocks = dblocks;
+
+ xfs_rtrmapbt_compute_maxlevels(nmp);
+ xfs_rtrefcountbt_compute_maxlevels(nmp);
+ xfs_trans_resv_calc(nmp, M_RES(nmp));
/*
* New summary size can't be more than half the size of the log. This
* prevents us from getting a log overflow, since we'll log basically
* the whole summary file at once.
*/
- if (nmp->m_rsumblocks > (mp->m_sb.sb_logblocks >> 1))
- error = -EINVAL;
+ min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp),
+ nmp->m_rsumblocks * 2);
kfree(nmp);
- return error;
+
+ if (min_logfsbs > mp->m_sb.sb_logblocks)
+ return -EINVAL;
+ return 0;
}
/*
@@ -1263,11 +1286,17 @@ xfs_growfs_rt(
XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
goto out_unlock;
- /* Unsupported realtime features. */
+ /* Check for features supported only on rtgroups filesystems. */
error = -EOPNOTSUPP;
- if (xfs_has_quota(mp) && !xfs_has_rtgroups(mp))
- goto out_unlock;
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ if (!xfs_has_rtgroups(mp)) {
+ if (xfs_has_rmapbt(mp))
+ goto out_unlock;
+ if (xfs_has_quota(mp))
+ goto out_unlock;
+ if (xfs_has_reflink(mp))
+ goto out_unlock;
+ } else if (xfs_has_reflink(mp) &&
+ !xfs_reflink_supports_rextsize(mp, in->extsize))
goto out_unlock;
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
@@ -1291,7 +1320,8 @@ xfs_growfs_rt(
goto out_unlock;
/* Make sure the new fs size won't cause problems with the log. */
- error = xfs_growfs_check_rtgeom(mp, in->newblocks, in->extsize);
+ error = xfs_growfs_check_rtgeom(mp, mp->m_sb.sb_dblocks, in->newblocks,
+ in->extsize);
if (error)
goto out_unlock;
@@ -1344,6 +1374,12 @@ xfs_growfs_rt(
if (!error)
error = error2;
+
+ /* Reset the rt metadata btree space reservations. */
+ xfs_rt_resv_free(mp);
+ error2 = xfs_rt_resv_init(mp);
+ if (error2 && error2 != -ENOSPC)
+ error = error2;
}
out_unlock:
@@ -1371,7 +1407,7 @@ xfs_rtmount_readsb(
/* m_blkbb_log is not set up yet */
error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
- mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
+ mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp,
&xfs_rtsb_buf_ops);
if (error) {
xfs_warn(mp, "rt sb validate failed with error %d.", error);
@@ -1487,6 +1523,46 @@ xfs_rtalloc_reinit_frextents(
return 0;
}
+/* Free space reservations for rt metadata inodes. */
+void
+xfs_rt_resv_free(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = NULL;
+ unsigned int i;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ for (i = 0; i < XFS_RTGI_MAX; i++)
+ xfs_metafile_resv_free(rtg->rtg_inodes[i]);
+ }
+}
+
+/* Reserve space for rt metadata inodes' space expansion. */
+int
+xfs_rt_resv_init(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = NULL;
+ xfs_filblks_t ask;
+ int error = 0;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ int err2;
+
+ ask = xfs_rtrmapbt_calc_reserves(mp);
+ err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
+ if (err2 && !error)
+ error = err2;
+
+ ask = xfs_rtrefcountbt_calc_reserves(mp);
+ err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ return error;
+}
+
/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@@ -1601,7 +1677,7 @@ xfs_rtpick_extent(
xfs_rtxlen_t len) /* allocation length (rtextents) */
{
struct xfs_mount *mp = rtg_mount(rtg);
- struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t b = 0; /* result rtext */
int log2; /* log of sequence number */
uint64_t resid; /* residual after log removed */
@@ -1885,7 +1961,7 @@ out_unlock:
goto out_release;
}
-static int
+int
xfs_rtallocate_rtgs(
struct xfs_trans *tp,
xfs_fsblock_t bno_hint,
@@ -1950,7 +2026,10 @@ xfs_rtallocate_align(
if (*noalign) {
align = mp->m_sb.sb_rextsize;
} else {
- align = xfs_get_extsz_hint(ap->ip);
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else
+ align = xfs_get_extsz_hint(ap->ip);
if (!align)
align = 1;
if (align == mp->m_sb.sb_rextsize)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8e2a07b8174b..0d95b29092c9 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -34,6 +34,9 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
+void xfs_rt_resv_free(struct xfs_mount *mp);
+int xfs_rt_resv_init(struct xfs_mount *mp);
+
/*
* Grow the realtime area of the filesystem.
*/
@@ -43,6 +46,8 @@ xfs_growfs_rt(
xfs_growfs_rt_t *in); /* user supplied growfs struct */
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
+int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, xfs_rfsblock_t dblocks,
+ xfs_rfsblock_t rblocks, xfs_agblock_t rextsize);
#else
# define xfs_growfs_rt(mp,in) (-ENOSYS)
# define xfs_rtalloc_reinit_frextents(m) (0)
@@ -60,6 +65,21 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
+# define xfs_rt_resv_free(mp) ((void)0)
+# define xfs_rt_resv_init(mp) (0)
+
+static inline int
+xfs_growfs_check_rtgeom(const struct xfs_mount *mp,
+ xfs_rfsblock_t dblocks, xfs_rfsblock_t rblocks,
+ xfs_extlen_t rextsize)
+{
+ return 0;
+}
#endif /* CONFIG_XFS_RT */
+int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint,
+ xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod,
+ bool wasdel, bool initial_user_data, xfs_rtblock_t *bno,
+ xfs_extlen_t *blen);
+
#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index ffb52725c2a8..35c7fb3ba324 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -52,7 +52,10 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "rmapbt", xfsstats_offset(xs_refcbt_2) },
{ "refcntbt", xfsstats_offset(xs_rmap_mem_2) },
{ "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
- { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)},
+ { "rcbagbt", xfsstats_offset(xs_rtrmap_2) },
+ { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)},
+ { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) },
+ { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index a61fb56ed2e6..15ba1abcf253 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -127,6 +127,9 @@ struct __xfsstats {
uint32_t xs_refcbt_2[__XBTS_MAX];
uint32_t xs_rmap_mem_2[__XBTS_MAX];
uint32_t xs_rcbag_2[__XBTS_MAX];
+ uint32_t xs_rtrmap_2[__XBTS_MAX];
+ uint32_t xs_rtrmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rtrefcbt_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 394fdf3bb535..62d04f4843cf 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -819,20 +819,76 @@ xfs_fs_sync_fs(
return 0;
}
+static xfs_extlen_t
+xfs_internal_log_size(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_sb.sb_logstart)
+ return 0;
+ return mp->m_sb.sb_logblocks;
+}
+
+static void
+xfs_statfs_data(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ int64_t fdblocks =
+ percpu_counter_sum(&mp->m_fdblocks);
+
+ /* make sure st->f_bfree does not underflow */
+ st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
+ /*
+ * sb_dblocks can change during growfs, but nothing cares about reporting
+ * the old or new value during growfs.
+ */
+ st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp);
+}
+
+/*
+ * When stat(v)fs is called on a file with the realtime bit set or a directory
+ * with the rtinherit bit, report freespace information for the RT device
+ * instead of the main data device.
+ */
+static void
+xfs_statfs_rt(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ st->f_bfree = xfs_rtbxlen_to_blen(mp,
+ percpu_counter_sum_positive(&mp->m_frextents));
+ st->f_blocks = mp->m_sb.sb_rblocks;
+}
+
+static void
+xfs_statfs_inodes(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ uint64_t icount = percpu_counter_sum(&mp->m_icount);
+ uint64_t ifree = percpu_counter_sum(&mp->m_ifree);
+ uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree);
+
+ st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
+ if (M_IGEO(mp)->maxicount)
+ st->f_files = min_t(typeof(st->f_files), st->f_files,
+ M_IGEO(mp)->maxicount);
+
+ /* If sb_icount overshot maxicount, report actual allocation */
+ st->f_files = max_t(typeof(st->f_files), st->f_files,
+ mp->m_sb.sb_icount);
+
+ /* Make sure st->f_ffree does not underflow */
+ st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree));
+}
+
STATIC int
xfs_fs_statfs(
struct dentry *dentry,
- struct kstatfs *statp)
+ struct kstatfs *st)
{
struct xfs_mount *mp = XFS_M(dentry->d_sb);
- xfs_sb_t *sbp = &mp->m_sb;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- uint64_t fakeinos, id;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- xfs_extlen_t lsize;
- int64_t ffree;
/*
* Expedite background inodegc but don't wait. We do not want to block
@@ -840,58 +896,28 @@ xfs_fs_statfs(
*/
xfs_inodegc_push(mp);
- statp->f_type = XFS_SUPER_MAGIC;
- statp->f_namelen = MAXNAMELEN - 1;
-
- id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
- statp->f_fsid = u64_to_fsid(id);
+ st->f_type = XFS_SUPER_MAGIC;
+ st->f_namelen = MAXNAMELEN - 1;
+ st->f_bsize = mp->m_sb.sb_blocksize;
+ st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev));
- icount = percpu_counter_sum(&mp->m_icount);
- ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
-
- spin_lock(&mp->m_sb_lock);
- statp->f_bsize = sbp->sb_blocksize;
- lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
- statp->f_blocks = sbp->sb_dblocks - lsize;
- spin_unlock(&mp->m_sb_lock);
-
- /* make sure statp->f_bfree does not underflow */
- statp->f_bfree = max_t(int64_t, 0,
- fdblocks - xfs_fdblocks_unavailable(mp));
- statp->f_bavail = statp->f_bfree;
-
- fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
- statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
- if (M_IGEO(mp)->maxicount)
- statp->f_files = min_t(typeof(statp->f_files),
- statp->f_files,
- M_IGEO(mp)->maxicount);
-
- /* If sb_icount overshot maxicount, report actual allocation */
- statp->f_files = max_t(typeof(statp->f_files),
- statp->f_files,
- sbp->sb_icount);
-
- /* make sure statp->f_ffree does not underflow */
- ffree = statp->f_files - (icount - ifree);
- statp->f_ffree = max_t(int64_t, ffree, 0);
+ xfs_statfs_data(mp, st);
+ xfs_statfs_inodes(mp, st);
if (XFS_IS_REALTIME_MOUNT(mp) &&
- (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
- s64 freertx;
-
- statp->f_blocks = sbp->sb_rblocks;
- freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree =
- xfs_rtbxlen_to_blen(mp, freertx);
- }
+ (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME)))
+ xfs_statfs_rt(mp, st);
if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
(XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
- xfs_qm_statvfs(ip, statp);
+ xfs_qm_statvfs(ip, st);
+ /*
+ * XFS does not distinguish between blocks available to privileged and
+ * unprivileged users.
+ */
+ st->f_bavail = st->f_bfree;
return 0;
}
@@ -1635,8 +1661,12 @@ xfs_fs_fill_super(
#endif
}
- /* Filesystem claims it needs repair, so refuse the mount. */
- if (xfs_has_needsrepair(mp)) {
+ /*
+ * Filesystem claims it needs repair, so refuse the mount unless
+ * norecovery is also specified, in which case the filesystem can
+ * be mounted with no risk of further damage.
+ */
+ if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) {
xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
error = -EFSCORRUPTED;
goto out_free_sb;
@@ -1730,7 +1760,7 @@ xfs_fs_fill_super(
sb->s_time_max = XFS_LEGACY_TIME_MAX;
}
trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
- sb->s_iflags |= SB_I_CGROUPWB;
+ sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
set_posix_acl_flag(sb);
@@ -1754,9 +1784,11 @@ xfs_fs_fill_super(
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
if (xfs_has_reflink(mp)) {
- if (mp->m_sb.sb_rblocks) {
+ if (xfs_has_realtime(mp) &&
+ !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) {
xfs_alert(mp,
- "reflink not compatible with realtime device!");
+ "reflink not compatible with realtime extent size %u!",
+ mp->m_sb.sb_rextsize);
error = -EINVAL;
goto out_filestream_unmount;
}
@@ -1767,12 +1799,6 @@ xfs_fs_fill_super(
}
}
- if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
- "reverse mapping btree not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
- }
if (xfs_has_exchange_range(mp))
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);
@@ -2096,7 +2122,8 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("xfs");
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 302e6e5d6c7e..c0e85c1e42f2 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -92,7 +92,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
extern const struct export_operations xfs_export_operations;
extern const struct quotactl_ops xfs_quotactl_operations;
-extern const struct dax_holder_operations xfs_dax_holder_operations;
extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index c84df23b494d..751dc74a3067 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax(
return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
}
-static struct ctl_table xfs_table[] = {
+static const struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7b16cdd72e9d..bfc2f1249022 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -14,11 +14,15 @@
* ino: filesystem inode number
*
* agbno: per-AG block number in fs blocks
+ * rgbno: per-rtgroup block number in fs blocks
* startblock: physical block number for file mappings. This is either a
* segmented fsblock for data device mappings, or a rfsblock
* for realtime device mappings
* fsbcount: number of blocks in an extent, in fs blocks
*
+ * gbno: generic allocation group block number. This is an agbno for
+ * space in a per-AG or a rgbno for space in a realtime group.
+ *
* daddr: physical block number in 512b blocks
* bbcount: number of blocks in a physical extent, in 512b blocks
*
@@ -494,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = atomic_read(&bp->b_hold);
+ __entry->hold = bp->b_hold;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -565,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->flags = flags;
- __entry->hold = atomic_read(&bp->b_hold);
+ __entry->hold = bp->b_hold;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
@@ -589,6 +593,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \
DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead);
TRACE_EVENT(xfs_buf_ioerror,
TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip),
@@ -608,7 +613,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
- __entry->hold = atomic_read(&bp->b_hold);
+ __entry->hold = bp->b_hold;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->error = error;
@@ -652,7 +657,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
__entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
- __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+ __entry->buf_hold = bip->bli_buf->b_hold;
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
@@ -2295,6 +2300,7 @@ TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_META_BTREE);
DECLARE_EVENT_CLASS(xfs_swap_extent_class,
TP_PROTO(struct xfs_inode *ip, int which),
@@ -2918,13 +2924,14 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_btree_cur *cur,
- xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
+ xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten,
const struct xfs_owner_info *oinfo),
- TP_ARGS(cur, agbno, len, unwritten, oinfo),
+ TP_ARGS(cur, gbno, len, unwritten, oinfo),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
__field(uint64_t, owner)
__field(uint64_t, offset)
@@ -2932,8 +2939,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->len = len;
__entry->owner = oinfo->oi_owner;
__entry->offset = oinfo->oi_offset;
@@ -2941,10 +2949,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
if (unwritten)
__entry->flags |= XFS_RMAP_UNWRITTEN;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len,
__entry->owner,
__entry->offset,
@@ -2953,9 +2962,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
#define DEFINE_RMAP_EVENT(name) \
DEFINE_EVENT(xfs_rmap_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, \
- xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
+ xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, \
const struct xfs_owner_info *oinfo), \
- TP_ARGS(cur, agbno, len, unwritten, oinfo))
+ TP_ARGS(cur, gbno, len, unwritten, oinfo))
/* btree cursor error/%ip tracepoint class */
DECLARE_EVENT_CLASS(xfs_btree_error_class,
@@ -3018,47 +3027,36 @@ TRACE_EVENT(xfs_rmap_convert_state,
TP_ARGS(cur, state, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_ino_t, ino)
__field(int, state)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- switch (cur->bc_ops->type) {
- case XFS_BTREE_TYPE_INODE:
- __entry->agno = 0;
- __entry->ino = cur->bc_ino.ip->i_ino;
- break;
- case XFS_BTREE_TYPE_AG:
- __entry->agno = cur->bc_group->xg_gno;
- __entry->ino = 0;
- break;
- case XFS_BTREE_TYPE_MEM:
- __entry->agno = 0;
- __entry->ino = 0;
- break;
- }
+ __entry->type = cur->bc_group->xg_type;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->state = state;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS",
+ TP_printk("dev %d:%d %sno 0x%x state %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->ino,
__entry->state,
(char *)__entry->caller_ip)
);
DECLARE_EVENT_CLASS(xfs_rmapbt_class,
TP_PROTO(struct xfs_btree_cur *cur,
- xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t gbno, xfs_extlen_t len,
uint64_t owner, uint64_t offset, unsigned int flags),
- TP_ARGS(cur, agbno, len, owner, offset, flags),
+ TP_ARGS(cur, gbno, len, owner, offset, flags),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
__field(uint64_t, owner)
__field(uint64_t, offset)
@@ -3066,17 +3064,19 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->len = len;
__entry->owner = owner;
__entry->offset = offset;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len,
__entry->owner,
__entry->offset,
@@ -3085,9 +3085,9 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
#define DEFINE_RMAPBT_EVENT(name) \
DEFINE_EVENT(xfs_rmapbt_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, \
- xfs_agblock_t agbno, xfs_extlen_t len, \
+ xfs_agblock_t gbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
- TP_ARGS(cur, agbno, len, owner, offset, flags))
+ TP_ARGS(cur, gbno, len, owner, offset, flags))
TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
@@ -3104,8 +3104,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned long long, owner)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(int, whichfork)
__field(xfs_fileoff_t, l_loff)
__field(xfs_filblks_t, l_len)
@@ -3114,9 +3115,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
- __entry->agbno = XFS_FSB_TO_AGBNO(mp,
- ri->ri_bmap.br_startblock);
+ __entry->type = ri->ri_group->xg_type;
+ __entry->agno = ri->ri_group->xg_gno;
+ __entry->gbno = xfs_fsb_to_gbno(mp,
+ ri->ri_bmap.br_startblock,
+ ri->ri_group->xg_type);
__entry->owner = ri->ri_owner;
__entry->whichfork = ri->ri_whichfork;
__entry->l_loff = ri->ri_bmap.br_startoff;
@@ -3124,11 +3127,12 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
__entry->l_state = ri->ri_bmap.br_state;
__entry->op = ri->ri_type;
),
- TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->owner,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->l_loff,
@@ -3302,56 +3306,62 @@ TRACE_EVENT(xfs_ag_resv_init_error,
/* refcount tracepoint classes */
DECLARE_EVENT_CLASS(xfs_refcount_class,
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno,
xfs_extlen_t len),
- TP_ARGS(cur, agbno, len),
+ TP_ARGS(cur, gbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len)
);
#define DEFINE_REFCOUNT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_class, name, \
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \
xfs_extlen_t len), \
- TP_ARGS(cur, agbno, len))
+ TP_ARGS(cur, gbno, len))
TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
TRACE_EVENT(xfs_refcount_lookup,
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno,
xfs_lookup_t dir),
- TP_ARGS(cur, agbno, dir),
+ TP_ARGS(cur, gbno, dir),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_lookup_t, dir)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
__entry->dir)
)
@@ -3362,6 +3372,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_ARGS(cur, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
@@ -3370,14 +3381,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
@@ -3393,49 +3406,53 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \
/* single-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec,
- xfs_agblock_t agbno),
- TP_ARGS(cur, irec, agbno),
+ xfs_agblock_t gbno),
+ TP_ARGS(cur, irec, gbno),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount,
- __entry->agbno)
+ __entry->gbno)
)
#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \
- xfs_agblock_t agbno), \
- TP_ARGS(cur, irec, agbno))
+ xfs_agblock_t gbno), \
+ TP_ARGS(cur, irec, gbno))
/* double-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
- struct xfs_refcount_irec *i2),
+ struct xfs_refcount_irec *i2),
TP_ARGS(cur, i1, i2),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3448,6 +3465,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3458,9 +3476,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3481,10 +3500,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
/* double-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
- struct xfs_refcount_irec *i2, xfs_agblock_t agbno),
- TP_ARGS(cur, i1, i2, agbno),
+ struct xfs_refcount_irec *i2, xfs_agblock_t gbno),
+ TP_ARGS(cur, i1, i2, gbno),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3494,10 +3514,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3507,11 +3528,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3521,14 +3543,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
- __entry->agbno)
+ __entry->gbno)
)
#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
- struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \
- TP_ARGS(cur, i1, i2, agbno))
+ struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \
+ TP_ARGS(cur, i1, i2, gbno))
/* triple-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
@@ -3537,6 +3559,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_ARGS(cur, i1, i2, i3),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3553,6 +3576,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3567,10 +3591,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3638,23 +3663,27 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
TP_ARGS(mp, refc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(int, op)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock);
+ __entry->type = refc->ri_group->xg_type;
+ __entry->agno = refc->ri_group->xg_gno;
__entry->op = refc->ri_type;
- __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
+ __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock,
+ refc->ri_group->xg_type);
__entry->len = refc->ri_blockcount;
),
- TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len)
);
#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
@@ -3993,7 +4022,7 @@ TRACE_EVENT(xfs_fsmap_mapping,
__entry->offset = frec->offset;
__entry->flags = frec->rm_flags;
),
- TP_printk("dev %d:%d keydev %d:%d agno 0x%x rmapbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x gbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->agno,
@@ -4950,7 +4979,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = atomic_read(&bp->b_hold);
+ __entry->hold = bp->b_hold;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -5574,6 +5603,72 @@ DEFINE_EVENT(xfs_metadir_class, name, \
TP_ARGS(dp, name, ino))
DEFINE_METADIR_EVENT(xfs_metadir_lookup);
+/* metadata inode space reservations */
+
+DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
+ TP_ARGS(ip, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long long, freeblks)
+ __field(unsigned long long, reserved)
+ __field(unsigned long long, asked)
+ __field(unsigned long long, used)
+ __field(unsigned long long, len)
+ ),
+ TP_fast_assign(
+ struct xfs_mount *mp = ip->i_mount;
+
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
+ __entry->reserved = ip->i_delayed_blks;
+ __entry->asked = ip->i_meta_resv_asked;
+ __entry->used = ip->i_nblocks;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->freeblks,
+ __entry->reserved,
+ __entry->asked,
+ __entry->used,
+ __entry->len)
+)
+#define DEFINE_METAFILE_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_metafile_resv_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
+ TP_ARGS(ip, len))
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
+DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xfs_growfs_check_rtgeom,
+ TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs),
+ TP_ARGS(mp, min_logfsbs),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, logblocks)
+ __field(unsigned int, min_logfsbs)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logblocks = mp->m_sb.sb_logblocks;
+ __entry->min_logfsbs = min_logfsbs;
+ ),
+ TP_printk("dev %d:%d logblocks %u min_logfsbs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logblocks,
+ __entry->min_logfsbs)
+);
+#endif /* CONFIG_XFS_RT */
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4cd25717c9d1..c6657072361a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -100,7 +100,6 @@ xfs_trans_dup(
/*
* Initialize the new transaction structure.
*/
- ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
ntp->t_mountp = tp->t_mountp;
INIT_LIST_HEAD(&ntp->t_items);
INIT_LIST_HEAD(&ntp->t_busy);
@@ -275,7 +274,6 @@ retry:
ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
xfs_has_lazysbcount(mp));
- tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
tp->t_mountp = mp;
INIT_LIST_HEAD(&tp->t_items);
@@ -1266,6 +1264,9 @@ retry:
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ if (xfs_is_metadir_inode(ip))
+ goto out;
+
error = xfs_qm_dqattach_locked(ip, false);
if (error) {
/* Caller should have allocated the dquots! */
@@ -1334,6 +1335,7 @@ retry:
goto out_cancel;
}
+out:
*tpp = tp;
return 0;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 71c2e82e4dad..2b366851e9a4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -122,7 +122,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
- unsigned int t_magic; /* magic number */
unsigned int t_log_res; /* amt of log space resvd */
unsigned int t_log_count; /* count for perm log res */
unsigned int t_blk_res; /* # of blocks resvd */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f56d62dced97..0fcb1828e598 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -359,13 +359,8 @@ xfsaild_resubmit_item(
}
/* protected by ail_lock */
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS))
- clear_bit(XFS_LI_FAILED, &lip->li_flags);
- else
- xfs_clear_li_failed(lip);
- }
-
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
xfs_buf_unlock(bp);
return XFS_ITEM_SUCCESS;
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8e886ecfd69a..53af546c0b23 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -659,7 +659,7 @@ xfs_trans_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_BUF;
- bp->b_flags |= _XBF_INODES;
+ bp->b_iodone = xfs_buf_inode_iodone;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -684,7 +684,7 @@ xfs_trans_stale_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_STALE_INODE;
- bp->b_flags |= _XBF_INODES;
+ bp->b_iodone = xfs_buf_inode_iodone;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -709,7 +709,7 @@ xfs_trans_inode_alloc_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
- bp->b_flags |= _XBF_INODES;
+ bp->b_iodone = xfs_buf_inode_iodone;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -820,6 +820,6 @@ xfs_trans_dquot_buf(
break;
}
- bp->b_flags |= _XBF_DQUOTS;
+ bp->b_iodone = xfs_buf_dquot_iodone;
xfs_trans_buf_set_type(tp, bp, type);
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 713b6d243e56..765456bf3428 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -156,7 +156,8 @@ xfs_trans_mod_ino_dquot(
unsigned int field,
int64_t delta)
{
- ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip));
+ if (xfs_is_metadir_inode(ip))
+ return;
xfs_trans_mod_dquot(tp, dqp, field, delta);
@@ -246,11 +247,10 @@ xfs_trans_mod_dquot_byino(
xfs_mount_t *mp = tp->t_mountp;
if (!XFS_IS_QUOTA_ON(mp) ||
- xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino) ||
+ xfs_is_metadir_inode(ip))
return;
- ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip));
-
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)