From bf4baaa087e2be0279991f1dbf9acaa7a4c9148c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 5 Oct 2024 17:37:02 -0400
Subject: bcachefs: Fix lockdep splat in bch2_accounting_read

We can't take sb_lock while holding mark_lock, so split out
replicas_entry_validate() and replicas_entry_sb_validate() -
replicas_entry_validate() now uses the normal online device interface.

00039 ========= TEST   set_option
00039
00039 WATCHDOG 30
00040 bcachefs (vdb): starting version 1.12: rebalance_work_acct_fix opts=errors=panic
00040 bcachefs (vdb): initializing new filesystem
00040 bcachefs (vdb): going read-write
00040 bcachefs (vdb): marking superblocks
00040 bcachefs (vdb): initializing freespace
00040 bcachefs (vdb): done initializing freespace
00040 bcachefs (vdb): reading snapshots table
00040 bcachefs (vdb): reading snapshots done
00040 bcachefs (vdb): done starting filesystem
00040 zstd
00041 bcachefs (vdb): shutting down
00041 bcachefs (vdb): going read-only
00041 bcachefs (vdb): finished waiting for writes to stop
00041 bcachefs (vdb): flushing journal and stopping allocators, journal seq 3
00041 bcachefs (vdb): flushing journal and stopping allocators complete, journal seq 11
00041 bcachefs (vdb): shutdown complete, journal seq 12
00041 bcachefs (vdb): marking filesystem clean
00041 bcachefs (vdb): shutdown complete
00041 Setting option on offline fs
00041 bch2_write_super(): fatal error : attempting to write superblock that wasn't version downgraded (1.12: (unknown version) > 1.10: disk_accounting_v3)
00041 fatal error - emergency read only
00041 bch2_write_super(): fatal error : attempting to write superblock that wasn't version downgraded (1.12: (unknown version) > 1.10: disk_accounting_v3)
00042 bcachefs (vdb): starting version 1.12: rebalance_work_acct_fix opts=errors=panic,compression=zstd
00042 bcachefs (vdb): recovering from clean shutdown, journal seq 12
00042 bcachefs (vdb): accounting_read...
00042
00042 ======================================================
00042 WARNING: possible circular locking dependency detected
00042 6.12.0-rc1-ktest-g805e938a8502 #6807 Not tainted
00042 ------------------------------------------------------
00042 mount.bcachefs/665 is trying to acquire lock:
00045 ffffff80cc280908 (&c->sb_lock){+.+.}-{3:3}, at: bch2_replicas_entry_validate (fs/bcachefs/replicas.c:102)
00045
00045 but task is already holding lock:
00048 ffffff80cc284870 (&c->mark_lock){++++}-{0:0}, at: bch2_accounting_read (fs/bcachefs/disk_accounting.c:670 (discriminator 1))
00048
00048 which lock already depends on the new lock.
00048
00048
00048 the existing dependency chain (in reverse order) is:
00048
00048 -> #1 (&c->mark_lock){++++}-{0:0}:
00049 percpu_down_write (kernel/locking/percpu-rwsem.c:232)
00052 bch2_sb_replicas_to_cpu_replicas (fs/bcachefs/replicas.c:583)
00055 bch2_sb_to_fs (fs/bcachefs/super-io.c:614)
00057 bch2_fs_open (fs/bcachefs/super.c:828 fs/bcachefs/super.c:2050)
00060 bch2_fs_get_tree (fs/bcachefs/fs.c:2067)
00062 vfs_get_tree (fs/super.c:1801)
00064 path_mount (fs/namespace.c:3507 fs/namespace.c:3834)
00066 __arm64_sys_mount (fs/namespace.c:3847 fs/namespace.c:4055 fs/namespace.c:4032 fs/namespace.c:4032)
00067 invoke_syscall.constprop.0 (arch/arm64/include/asm/syscall.h:61 arch/arm64/kernel/syscall.c:54)
00068 do_el0_svc (include/linux/thread_info.h:127 (discriminator 2) arch/arm64/kernel/syscall.c:140 (discriminator 2) arch/arm64/kernel/syscall.c:151 (discriminator 2))
00069 el0_svc (arch/arm64/include/asm/irqflags.h:82 arch/arm64/include/asm/irqflags.h:123 arch/arm64/include/asm/irqflags.h:136 arch/arm64/kernel/entry-common.c:165 arch/arm64/kernel/entry-common.c:178 arch/arm64/kernel/entry-common.c:713)
00069 ========= FAILED TIMEOUT set_option in 30s

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/replicas.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index bcb3276747e0..797da1032318 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
 	prt_printf(out, "]");
 }
 
-static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
-					       struct bch_sb *sb,
-					       struct printbuf *err)
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+					   struct bch_sb *sb,
+					   struct printbuf *err)
 {
 	if (!r->nr_devs) {
 		prt_printf(err, "no devices in entry ");
@@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
 				 struct bch_fs *c,
 				 struct printbuf *err)
 {
-	mutex_lock(&c->sb_lock);
-	int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
-	mutex_unlock(&c->sb_lock);
-	return ret;
+	if (!r->nr_devs) {
+		prt_printf(err, "no devices in entry ");
+		goto bad;
+	}
+
+	if (r->nr_required > 1 &&
+	    r->nr_required >= r->nr_devs) {
+		prt_printf(err, "bad nr_required in entry ");
+		goto bad;
+	}
+
+	for (unsigned i = 0; i < r->nr_devs; i++)
+		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+		    !bch2_dev_exists(c, r->devs[i])) {
+			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+			goto bad;
+		}
+
+	return 0;
+bad:
+	bch2_replicas_entry_to_text(err, r);
+	return -BCH_ERR_invalid_replicas_entry;
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 		struct bch_replicas_entry_v1 *e =
 			cpu_replicas_entry(cpu_r, i);
 
-		int ret = bch2_replicas_entry_validate_locked(e, sb, err);
+		int ret = bch2_replicas_entry_sb_validate(e, sb, err);
 		if (ret)
 			return ret;
 
-- 
cgit v1.2.3


From bade9711e0905eaa99e2ed98fc9642acaf9ba2b5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 29 Sep 2024 23:40:28 -0400
Subject: bcachefs: Split out check_unreachable_inodes() pass

With inode backpointers, we can write a very simple
check_unreachable_inodes() pass that only looks for non-unlinked inodes
that are missing backpointers, and reattaches them.

This simplifies check_directory_structure() so that it's now only
checking for directory structure loops.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c                  | 100 +++++++++++++++++++++++-------------
 fs/bcachefs/fsck.h                  |   1 +
 fs/bcachefs/recovery_passes_types.h |   1 +
 3 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index b8a6ceb0cc7a..257366ec7939 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1292,6 +1292,58 @@ int bch2_check_inodes(struct bch_fs *c)
 	return ret;
 }
 
+static int check_unreachable_inode(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (!bkey_is_inode(k.k))
+		return 0;
+
+	struct bch_inode_unpacked inode;
+	BUG_ON(bch2_inode_unpack(k, &inode));
+
+	if (inode.bi_subvol)
+		return 0;
+
+	if (inode.bi_flags & BCH_INODE_unlinked)
+		return 0;
+
+	if (fsck_err_on(!inode.bi_dir,
+			trans, inode_unreachable,
+			"unreachable inode:\n%s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k),
+			 buf.buf)))
+		ret = reattach_inode(trans, &inode);
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we know that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+				POS_MIN,
+				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			check_unreachable_inode(trans, &iter, k)));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
 {
 	switch (btree) {
@@ -2450,22 +2502,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
 		if (ret)
 			break;
 
-		/*
-		 * We've checked that inode backpointers point to valid dirents;
-		 * here, it's sufficient to check that the subvolume root has a
-		 * dirent:
-		 */
-		if (fsck_err_on(!subvol_root.bi_dir,
-				trans, subvol_unreachable,
-				"unreachable subvolume %s",
-				(bch2_bkey_val_to_text(&buf, c, s.s_c),
-				 prt_newline(&buf),
-				 bch2_inode_unpacked_to_text(&buf, &subvol_root),
-				 buf.buf))) {
-			ret = reattach_subvol(trans, s);
-			break;
-		}
-
 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
 
 		if (darray_u32_has(&subvol_path, parent)) {
@@ -2526,12 +2562,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
 	return false;
 }
 
-/*
- * Check that a given inode is reachable from its subvolume root - we already
- * verified subvolume connectivity:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
 {
 	struct bch_fs *c = trans->c;
@@ -2545,6 +2575,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 
 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
 
+	if (!S_ISDIR(inode.bi_mode))
+		return 0;
+
 	while (!inode.bi_subvol) {
 		struct btree_iter dirent_iter;
 		struct bkey_s_c_dirent d;
@@ -2559,21 +2592,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 			bch2_trans_iter_exit(trans, &dirent_iter);
 
 		if (bch2_err_matches(ret, ENOENT)) {
-			ret = 0;
-			if (fsck_err(trans, inode_unreachable,
-				     "unreachable inode\n%s",
-				     (printbuf_reset(&buf),
-				      bch2_bkey_val_to_text(&buf, c, inode_k),
-				      buf.buf)))
-				ret = reattach_inode(trans, &inode);
+			printbuf_reset(&buf);
+			bch2_bkey_val_to_text(&buf, c, inode_k);
+			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+				bch2_err_str(ret), buf.buf);
 			goto out;
 		}
 
 		bch2_trans_iter_exit(trans, &dirent_iter);
 
-		if (!S_ISDIR(inode.bi_mode))
-			break;
-
 		ret = darray_push(p, ((struct pathbuf_entry) {
 			.inum		= inode.bi_inum,
 			.snapshot	= snapshot,
@@ -2626,9 +2653,8 @@ fsck_err:
 }
 
 /*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
@@ -2756,6 +2782,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
 			if (S_ISDIR(u.bi_mode))
 				continue;
 
+			/*
+			 * Previous passes ensured that bi_nlink is nonzero if
+			 * it had multiple hardlinks:
+			 */
 			if (!u.bi_nlink)
 				continue;
 
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index a4ef94271784..1cca31011530 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *);
 int bch2_check_xattrs(struct bch_fs *);
 int bch2_check_root(struct bch_fs *);
 int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
 int bch2_check_directory_structure(struct bch_fs *);
 int bch2_check_nlinks(struct bch_fs *);
 int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 50406ce0e4ef..9d96c06e365c 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -46,6 +46,7 @@
 	x(check_dirents,			27, PASS_FSCK)			\
 	x(check_xattrs,				28, PASS_FSCK)			\
 	x(check_root,				29, PASS_ONLINE|PASS_FSCK)	\
+	x(check_unreachable_inodes,		40, PASS_ONLINE|PASS_FSCK)	\
 	x(check_subvolume_structure,		36, PASS_ONLINE|PASS_FSCK)	\
 	x(check_directory_structure,		30, PASS_ONLINE|PASS_FSCK)	\
 	x(check_nlinks,				31, PASS_FSCK)			\
-- 
cgit v1.2.3


From 38864eccf78b4e8ab9e2b7a4320943b1feb6872a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 30 Sep 2024 19:03:19 -0400
Subject: bcachefs: reattach_inode() now correctly handles interior snapshot
 nodes

When we find an unreachable inode, we now reattach it in the oldest
version that needs to be reattached (thus avoiding redundant work
reattaching every single version), and we now fix up inode -> dirent
backpointers in newer versions as needed - or white out the reattaching
dirent in newer versions, if the newer version isn't supposed to be
reattached.

This results in the second verify fsck now passing cleanly after
repairing on a user-provided filesystem image with thousands of
different snapshots.

Reported-by: Christopher Snowhill <chris@kode54.net>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.h |   8 +++
 fs/bcachefs/fsck.c       | 170 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 158 insertions(+), 20 deletions(-)

diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 78e63ad7d380..31a58bf46fdb 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 	for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
 					  SPOS_MAX, _flags, _k, _ret)
 
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id,	\
+					     _start, _flags, _k, _ret)	\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_rewind(&(_iter)))
+
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
 	for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
 
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 257366ec7939..92f9cabb6eae 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -326,17 +326,54 @@ err:
 	return ret;
 }
 
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+		return false;
+
+	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+					SPOS(d_pos.inode, d_pos.offset, snapshot),
+					BTREE_ITER_intent|
+					BTREE_ITER_with_updates);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (bpos_eq(k.k->p, d_pos)) {
+		/*
+		 * delete_at() doesn't work because the update path doesn't
+		 * internally use BTREE_ITER_with_updates yet
+		 */
+		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+		ret = PTR_ERR_OR_ZERO(k);
+		if (ret)
+			goto err;
+
+		bkey_init(&k->k);
+		k->k.type = KEY_TYPE_whiteout;
+		k->k.p = iter.pos;
+		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 {
 	struct bch_fs *c = trans->c;
-	struct bch_hash_info dir_hash;
 	struct bch_inode_unpacked lostfound;
 	char name_buf[20];
-	struct qstr name;
-	u64 dir_offset = 0;
-	u32 dirent_snapshot = inode->bi_snapshot;
 	int ret;
 
+	u32 dirent_snapshot = inode->bi_snapshot;
 	if (inode->bi_subvol) {
 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
 
@@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
 	if (ret)
 		return ret;
 
-	dir_hash = bch2_hash_info_init(c, &lostfound);
+	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+	struct qstr name = (struct qstr) QSTR(name_buf);
 
-	name = (struct qstr) QSTR(name_buf);
+	inode->bi_dir = lostfound.bi_inum;
 
 	ret = bch2_dirent_create_snapshot(trans,
 				inode->bi_parent_subvol, lostfound.bi_inum,
@@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
 				inode_d_type(inode),
 				&name,
 				inode->bi_subvol ?: inode->bi_inum,
-				&dir_offset,
+				&inode->bi_dir_offset,
 				STR_HASH_must_create);
 	if (ret) {
 		bch_err_msg(c, ret, "error creating dirent");
 		return ret;
 	}
 
-	inode->bi_dir		= lostfound.bi_inum;
-	inode->bi_dir_offset	= dir_offset;
+	ret = __bch2_fsck_write_inode(trans, inode);
+	if (ret)
+		return ret;
+
+	/*
+	 * Fix up inodes in child snapshots: if they should also be reattached
+	 * update the backpointer field, if they should not be we need to emit
+	 * whiteouts for the dirent we just created.
+	 */
+	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+		snapshot_id_list whiteouts_done;
+		struct btree_iter iter;
+		struct bkey_s_c k;
+
+		darray_init(&whiteouts_done);
 
-	return __bch2_fsck_write_inode(trans, inode);
+		for_each_btree_key_reverse_norestart(trans, iter,
+				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+			if (k.k->p.offset != inode->bi_inum)
+				break;
+
+			if (!bkey_is_inode(k.k) ||
+			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+				continue;
+
+			struct bch_inode_unpacked child_inode;
+			bch2_inode_unpack(k, &child_inode);
+
+			if (!inode_should_reattach(&child_inode)) {
+				ret = maybe_delete_dirent(trans,
+							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+							       dirent_snapshot),
+							  k.k->p.snapshot);
+				if (ret)
+					break;
+
+				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+				if (ret)
+					break;
+			} else {
+				iter.snapshot = k.k->p.snapshot;
+				child_inode.bi_dir = inode->bi_dir;
+				child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+							     BTREE_UPDATE_internal_snapshot_node);
+				if (ret)
+					break;
+			}
+		}
+		darray_exit(&whiteouts_done);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	return ret;
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -1292,11 +1383,49 @@ int bch2_check_inodes(struct bch_fs *c)
 	return ret;
 }
 
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+					    struct bch_inode_unpacked *inode)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	/*
+	 * We look for inodes to reattach in natural key order, leaves first,
+	 * but we should do the reattach at the oldest version that needs to be
+	 * reattached:
+	 */
+	for_each_btree_key_norestart(trans, iter,
+				     BTREE_ID_inodes,
+				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+				     BTREE_ITER_all_snapshots, k, ret) {
+		if (k.k->p.offset != inode->bi_inum)
+			break;
+
+		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+			continue;
+
+		if (!bkey_is_inode(k.k))
+			break;
+
+		struct bch_inode_unpacked parent_inode;
+		bch2_inode_unpack(k, &parent_inode);
+
+		if (!inode_should_reattach(&parent_inode))
+			break;
+
+		*inode = parent_inode;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
 static int check_unreachable_inode(struct btree_trans *trans,
 				   struct btree_iter *iter,
 				   struct bkey_s_c k)
 {
-	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
@@ -1306,18 +1435,17 @@ static int check_unreachable_inode(struct btree_trans *trans,
 	struct bch_inode_unpacked inode;
 	BUG_ON(bch2_inode_unpack(k, &inode));
 
-	if (inode.bi_subvol)
+	if (!inode_should_reattach(&inode))
 		return 0;
 
-	if (inode.bi_flags & BCH_INODE_unlinked)
-		return 0;
+	ret = find_oldest_inode_needs_reattach(trans, &inode);
+	if (ret)
+		return ret;
 
-	if (fsck_err_on(!inode.bi_dir,
-			trans, inode_unreachable,
-			"unreachable inode:\n%s",
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k),
-			 buf.buf)))
+	if (fsck_err(trans, inode_unreachable,
+		     "unreachable inode:\n%s",
+		     (bch2_inode_unpacked_to_text(&buf, &inode),
+		      buf.buf)))
 		ret = reattach_inode(trans, &inode);
 fsck_err:
 	printbuf_exit(&buf);
@@ -1331,6 +1459,8 @@ fsck_err:
  * backpointer fields point to valid dirents, and every inode that has a dirent
  * that points to it has its backpointer field set - so we're just looking for
  * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
  */
 int bch2_check_unreachable_inodes(struct bch_fs *c)
 {
-- 
cgit v1.2.3


From 12f286085bf592346cffa93d8e21b0cc2c01f9fe Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 4 Oct 2024 21:40:13 -0400
Subject: bcachefs: btree_iter_peek_upto() now handles BTREE_ITER_all_snapshots

end_pos now compares against snapshot ID when required

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index bfe9f0c1e1be..0883cf6e1a3e 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
 		else
 			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
 
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(iter_pos, end)
-			     : bkey_ge(iter_pos, end)))
+		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(iter_pos, end) :
+			     iter->flags & BTREE_ITER_is_extents	? bkey_ge(iter_pos, end) :
+									  bkey_gt(iter_pos, end)))
 			goto end;
 
 		break;
-- 
cgit v1.2.3


From cba31b7eee41eb34941d040bddaed3628f160cae Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 28 Sep 2024 23:30:05 -0400
Subject: bcachefs: Delete vestigial check_inode() checks

BCH_INODE_i_size_dirty dates from before we had logged operations for
truncate (as well as finsert) - it hasn't been needed since before
bcachefs was mainlined.

BCH_INODE_i_sectors_dirty hasn't been needed since we started always
updating i_sectors transactionally - it's been unused for even longer.

BCH_INODE_backptr_untrusted also hasn't been used since prior to
mainlining; when unlinking a hardlink, we zero out the backpointer
fields if they're for the dirent being removed.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c | 80 ++++--------------------------------------------------
 1 file changed, 5 insertions(+), 75 deletions(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 92f9cabb6eae..171e3e47db5c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1085,7 +1085,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
 		 */
 		inode->bi_dir = 0;
 		inode->bi_dir_offset = 0;
-		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
 		*write_inode = true;
 	}
 
@@ -1117,8 +1116,7 @@ static int check_inode(struct btree_trans *trans,
 		       struct btree_iter *iter,
 		       struct bkey_s_c k,
 		       struct bch_inode_unpacked *prev,
-		       struct snapshots_seen *s,
-		       bool full)
+		       struct snapshots_seen *s)
 {
 	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
@@ -1141,12 +1139,6 @@ static int check_inode(struct btree_trans *trans,
 
 	BUG_ON(bch2_inode_unpack(k, &u));
 
-	if (!full &&
-	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
-			    BCH_INODE_i_sectors_dirty|
-			    BCH_INODE_unlinked)))
-		return 0;
-
 	if (prev->bi_inum != u.bi_inum)
 		*prev = u;
 
@@ -1192,7 +1184,7 @@ static int check_inode(struct btree_trans *trans,
 		ret = 0;
 	}
 
-	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+	if ((u.bi_flags & BCH_INODE_unlinked) &&
 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
 		struct bpos new_min_pos;
 
@@ -1200,7 +1192,7 @@ static int check_inode(struct btree_trans *trans,
 		if (ret)
 			goto err;
 
-		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
+		u.bi_flags &= ~BCH_INODE_unlinked;
 
 		ret = __bch2_fsck_write_inode(trans, &u);
 
@@ -1249,66 +1241,6 @@ static int check_inode(struct btree_trans *trans,
 		}
 	}
 
-	/* i_size_dirty is vestigal, since we now have logged ops for truncate * */
-	if (u.bi_flags & BCH_INODE_i_size_dirty &&
-	    (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
-	     fsck_err(trans, inode_i_size_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_size dirty",
-		      u.bi_inum))) {
-		bch_verbose(c, "truncating inode %llu", u.bi_inum);
-
-		/*
-		 * XXX: need to truncate partial blocks too here - or ideally
-		 * just switch units to bytes and that issue goes away
-		 */
-		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
-				     iter->pos.snapshot),
-				POS(u.bi_inum, U64_MAX),
-				0, NULL);
-		bch_err_msg(c, ret, "in fsck truncating inode");
-		if (ret)
-			return ret;
-
-		/*
-		 * We truncated without our normal sector accounting hook, just
-		 * make sure we recalculate it:
-		 */
-		u.bi_flags |= BCH_INODE_i_sectors_dirty;
-
-		u.bi_flags &= ~BCH_INODE_i_size_dirty;
-		do_update = true;
-	}
-
-	/* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */
-	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
-	    (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
-	     fsck_err(trans, inode_i_sectors_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_sectors dirty",
-		      u.bi_inum))) {
-		s64 sectors;
-
-		bch_verbose(c, "recounting sectors for inode %llu",
-			    u.bi_inum);
-
-		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
-		if (sectors < 0) {
-			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
-			return sectors;
-		}
-
-		u.bi_sectors = sectors;
-		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
-		do_update = true;
-	}
-
-	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
-		u.bi_dir = 0;
-		u.bi_dir_offset = 0;
-		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
-		do_update = true;
-	}
-
 	if (fsck_err_on(u.bi_parent_subvol &&
 			(u.bi_subvol == 0 ||
 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
@@ -1365,7 +1297,6 @@ err_noprint:
 
 int bch2_check_inodes(struct bch_fs *c)
 {
-	bool full = c->opts.fsck;
 	struct bch_inode_unpacked prev = { 0 };
 	struct snapshots_seen s;
 
@@ -1376,7 +1307,7 @@ int bch2_check_inodes(struct bch_fs *c)
 				POS_MIN,
 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_inode(trans, &iter, k, &prev, &s, full)));
+			check_inode(trans, &iter, k, &prev, &s)));
 
 	snapshots_seen_exit(&s);
 	bch_err_fn(c, ret);
@@ -1876,8 +1807,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
 				continue;
 
-			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
-					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
 					!bkey_extent_is_reservation(k),
 					trans, extent_past_end_of_inode,
 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
-- 
cgit v1.2.3


From 9b23fdbd5d29beb5bd272c304e0d978edd32f513 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 29 Sep 2024 22:11:37 -0400
Subject: bcachefs: bcachefs_metadata_version_inode_has_child_snapshots

There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.

In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.

This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".

To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for the case with snapshots) was:
  Unlinked, non open files are deleted, either at recovery time or
  during online fsck

The new rule is:
  Unlinked, non open files, that do not exist in child snapshots, are
  deleted.

To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.

For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/bcachefs_format.h  |   3 +-
 fs/bcachefs/fs.c               |  21 +++-
 fs/bcachefs/fs.h               |   9 +-
 fs/bcachefs/fsck.c             |  51 +++-----
 fs/bcachefs/inode.c            | 274 ++++++++++++++++++++++++++++++++++++-----
 fs/bcachefs/inode.h            |  10 ++
 fs/bcachefs/inode_format.h     |   3 +-
 fs/bcachefs/sb-downgrade.c     |   5 +-
 fs/bcachefs/sb-errors_format.h |   4 +-
 9 files changed, 302 insertions(+), 78 deletions(-)

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 84832c2d4df9..5004f6ba997c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
 	x(disk_accounting_v2,		BCH_VERSION(1,  9))		\
 	x(disk_accounting_v3,		BCH_VERSION(1, 10))		\
 	x(disk_accounting_inum,		BCH_VERSION(1, 11))		\
-	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))
+	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))		\
+	x(inode_has_child_snapshots,	BCH_VERSION(1, 13))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 5bfc26d58270..23cae92d313d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -174,11 +174,30 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
 	.automatic_shrinking	= true,
 };
 
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
 {
 	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
 }
 
+bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
+{
+	if (!test_bit(BCH_FS_started, &c->flags))
+		return false;
+
+	subvol_inum inum = {
+		.subvol = snapshot_t(c, p.snapshot)->subvol,
+		.inum	= p.offset,
+	};
+
+	/* snapshot tree interior node, can't safely delete while online (yet) */
+	if (!inum.subvol) {
+		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
+		return true;
+	}
+
+	return __bch2_inode_hash_find(c, inum) != NULL;
+}
+
 static void __wait_on_freeing_inode(struct bch_fs *c,
 				    struct bch_inode_info *inode,
 				    subvol_inum inum)
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index da74ecc236e7..40dbd5774d0b 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
 	return inode->ei_inum;
 }
 
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
-
 /*
  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
  * btree inode may be inconsistent:
@@ -181,6 +179,8 @@ void bch2_inode_update_after_write(struct btree_trans *,
 int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 				  inode_set_fn, void *, unsigned);
 
+bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
+
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
 
 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
 
-static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
-	return NULL;
-}
+static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 171e3e47db5c..f00a36f62323 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1096,22 +1096,6 @@ fsck_err:
 	return ret;
 }
 
-static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
-{
-	subvol_inum inum = {
-		.subvol = snapshot_t(c, p.snapshot)->subvol,
-		.inum	= p.offset,
-	};
-
-	/* snapshot tree corruption, can't safely delete */
-	if (!inum.subvol) {
-		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
-		return true;
-	}
-
-	return __bch2_inode_hash_find(c, inum) != NULL;
-}
-
 static int check_inode(struct btree_trans *trans,
 		       struct btree_iter *iter,
 		       struct bkey_s_c k,
@@ -1184,28 +1168,27 @@ static int check_inode(struct btree_trans *trans,
 		ret = 0;
 	}
 
-	if ((u.bi_flags & BCH_INODE_unlinked) &&
-	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
-		if (ret)
-			goto err;
-
-		u.bi_flags &= ~BCH_INODE_unlinked;
-
-		ret = __bch2_fsck_write_inode(trans, &u);
+	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+	if (ret < 0)
+		goto err;
 
-		bch_err_msg(c, ret, "in fsck updating inode");
+	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+			trans, inode_has_child_snapshots_wrong,
+			"inode has_child_snapshots flag wrong (should be %u)\n%s",
+			ret,
+			(printbuf_reset(&buf),
+			 bch2_inode_unpacked_to_text(&buf, &u),
+			 buf.buf))) {
 		if (ret)
-			goto err_noprint;
-
-		if (!bpos_eq(new_min_pos, POS_MIN))
-			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
-		goto err_noprint;
+			u.bi_flags |= BCH_INODE_has_child_snapshot;
+		else
+			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
+		do_update = true;
 	}
+	ret = 0;
 
-	if (u.bi_flags & BCH_INODE_unlinked) {
+	if ((u.bi_flags & BCH_INODE_unlinked) &&
+	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
 		if (!test_bit(BCH_FS_started, &c->flags)) {
 			/*
 			 * If we're not in online fsck, don't delete unlinked
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 74d7a42ba1a2..9d6040d4ba39 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -12,6 +12,7 @@
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
+#include "fs.h"
 #include "inode.h"
 #include "str_hash.h"
 #include "snapshot.h"
@@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = {
 };
 #undef  x
 
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+
 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
 
 static int inode_decode_field(const u8 *in, const u8 *end,
@@ -575,9 +578,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
 	}
 }
 
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_inode:
+		bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+		return;
+	case KEY_TYPE_inode_v2:
+		bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+		return;
+	case KEY_TYPE_inode_v3:
+		bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+		return;
+	default:
+		BUG();
+	}
+}
+
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
+{
+	u64 f = bkey_inode_flags(k); /* unmasked: both bits are tested below */
+
+	return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+				   enum btree_id btree, struct bpos pos,
+				   unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_upto_norestart(trans, *iter, btree,
+					  bpos_successor(pos),
+					  SPOS(pos.inode, pos.offset, U32_MAX),
+					  flags|BTREE_ITER_all_snapshots, k, ret)
+		if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+			return k;
+
+	bch2_trans_iter_exit(trans, iter);
+	return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+				    struct bpos pos, unsigned flags)
+{
+	struct bkey_s_c k;
+again:
+	k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+	if (!k.k ||
+	    bkey_err(k) ||
+	    bkey_is_inode(k.k))
+		return k;
+
+	bch2_trans_iter_exit(trans, iter);
+	pos = k.k->p;
+	goto again;
+}
+
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
 {
-	return bkey_inode_flags(k) & BCH_INODE_unlinked;
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_upto_norestart(trans, iter,
+			BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+			BTREE_ITER_all_snapshots|
+			BTREE_ITER_with_updates, k, ret)
+		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+		    bkey_is_inode(k.k)) {
+			ret = 1;
+			break;
+		}
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int update_inode_has_children(struct btree_trans *trans,
+				     struct bkey_s k,
+				     bool have_child)
+{
+	if (!have_child) {
+		int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+		if (ret)
+			return ret < 0 ? ret : 0;
+	}
+
+	u64 f = bkey_inode_flags(k.s_c);
+	if (have_child != !!(f & BCH_INODE_has_child_snapshot))
+		bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
+
+	return 0;
+}
+
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+					    bool have_child)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+						&iter, pos, BTREE_ITER_with_updates);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+	if (!k.k)
+		return 0;
+
+	if (!have_child) {
+		ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+		if (ret) {
+			ret = ret < 0 ? ret : 0;
+			goto err;
+		}
+	}
+
+	u64 f = bkey_inode_flags(k);
+	if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+		struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+					     BTREE_UPDATE_internal_snapshot_node);
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
 }
 
 int bch2_trigger_inode(struct btree_trans *trans,
@@ -586,6 +717,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
 		       struct bkey_s new,
 		       enum btree_iter_update_trigger_flags flags)
 {
+	struct bch_fs *c = trans->c;
+
 	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
 		BUG_ON(!trans->journal_res.seq);
 		bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -599,13 +732,41 @@ int bch2_trigger_inode(struct btree_trans *trans,
 			return ret;
 	}
 
-	int deleted_delta =	(int) bkey_is_deleted_inode(new.s_c) -
-				(int) bkey_is_deleted_inode(old);
-	if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
-		int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
-						      new.k->p, deleted_delta > 0);
-		if (ret)
-			return ret;
+	if (flags & BTREE_TRIGGER_transactional) {
+		int unlinked_delta =	(int) bkey_is_unlinked_inode(new.s_c) -
+					(int) bkey_is_unlinked_inode(old);
+		if (unlinked_delta) {
+			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+							      new.k->p, unlinked_delta > 0);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * If we're creating or deleting an inode at this snapshot ID,
+		 * and there might be an inode in a parent snapshot ID, we might
+		 * need to set or clear the has_child_snapshot flag on the
+		 * parent.
+		 */
+		int deleted_delta = (int) bkey_is_inode(new.k) -
+				    (int) bkey_is_inode(old.k);
+		if (deleted_delta &&
+		    bch2_snapshot_parent(c, new.k->p.snapshot)) {
+			int ret = update_parent_inode_has_children(trans, new.k->p,
+								   deleted_delta > 0);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * When an inode is first updated in a new snapshot, we may need
+		 * to clear has_child_snapshot
+		 */
+		if (deleted_delta > 0) {
+			int ret = update_inode_has_children(trans, new, false);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -888,6 +1049,11 @@ err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
+	if (ret)
+		goto err2;
+
+	ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
 	bch2_trans_put(trans);
 	return ret;
 }
@@ -992,7 +1158,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
 	return 0;
 }
 
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter = { NULL };
@@ -1055,6 +1221,44 @@ err:
 	return ret ?: -BCH_ERR_transaction_restart_nested;
 }
 
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+next_parent:
+	ret = lockrestart_do(trans,
+		bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+	if (ret || !k.k)
+		return ret;
+
+	bool unlinked = bkey_is_unlinked_inode(k);
+	pos = k.k->p;
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (!unlinked)
+		return 0;
+
+	if (bch2_inode_is_open(trans->c, pos))
+		return 0;
+
+	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+	if (ret)
+		return ret;
+	goto next_parent;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+	return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
+		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
 static int may_delete_deleted_inode(struct btree_trans *trans,
 				    struct btree_iter *iter,
 				    struct bpos pos,
@@ -1064,6 +1268,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 	struct btree_iter inode_iter;
 	struct bkey_s_c k;
 	struct bch_inode_unpacked inode;
+	struct printbuf buf = PRINTBUF;
 	int ret;
 
 	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
@@ -1099,6 +1304,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 			pos.offset, pos.snapshot))
 		goto delete;
 
+	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+			trans, deleted_inode_has_child_snapshots,
+			"inode with child snapshots %llu:%u in deleted_inodes btree",
+			pos.offset, pos.snapshot))
+		goto delete;
+
+	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+	if (ret < 0)
+		goto out;
+
+	if (ret) {
+		if (fsck_err(trans, inode_has_child_snapshots_wrong,
+			     "inode has_child_snapshots flag wrong (should be set)\n%s",
+			     (printbuf_reset(&buf),
+			      bch2_inode_unpacked_to_text(&buf, &inode),
+			      buf.buf))) {
+			inode.bi_flags |= BCH_INODE_has_child_snapshot;
+			ret = __bch2_fsck_write_inode(trans, &inode);
+			if (ret)
+				goto out;
+		}
+		goto delete;
+
+	}
+
 	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
 	    !fsck_err(trans, deleted_inode_but_clean,
 		      "filesystem marked as clean but have deleted inode %llu:%u",
@@ -1107,33 +1337,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 		goto out;
 	}
 
-	if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
-		if (ret)
-			goto out;
-
-		inode.bi_flags &= ~BCH_INODE_unlinked;
-
-		ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
-					     BTREE_UPDATE_internal_snapshot_node);
-		bch_err_msg(c, ret, "clearing inode unlinked flag");
-		if (ret)
-			goto out;
-
-		/*
-		 * We'll need another write buffer flush to pick up the new
-		 * unlinked inodes in the snapshot leaves:
-		 */
-		*need_another_pass = true;
-		goto out;
-	}
-
 	ret = 1;
 out:
 fsck_err:
 	bch2_trans_iter_exit(trans, &inode_iter);
+	printbuf_exit(&buf);
 	return ret;
 delete:
 	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 9c1f67705684..c8e98443e2d4 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -5,6 +5,7 @@
 #include "bkey.h"
 #include "bkey_methods.h"
 #include "opts.h"
+#include "snapshot.h"
 
 enum bch_validate_flags;
 extern const char * const bch2_inode_opts[];
@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
 			  enum bch_validate_flags);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+	return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
+		? __bch2_inode_has_child_snapshots(trans, pos)
+		: 0;
+}
+
 int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
 		       struct bkey_s_c, struct bkey_s,
 		       enum btree_iter_update_trigger_flags);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 83d107331edf..a204e46b6b47 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -133,7 +133,8 @@ enum inode_opt_id {
 	x(i_size_dirty,			5)	\
 	x(i_sectors_dirty,		6)	\
 	x(unlinked,			7)	\
-	x(backptr_untrusted,		8)
+	x(backptr_untrusted,		8)	\
+	x(has_child_snapshot,		9)
 
 /* bits 20+ reserved for packed fields below: */
 
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 5102059a0f1d..ae715ff658e8 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -78,7 +78,10 @@
 	  BCH_FSCK_ERR_accounting_mismatch)			\
 	x(rebalance_work_acct_fix,				\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
-	  BCH_FSCK_ERR_accounting_mismatch)
+	  BCH_FSCK_ERR_accounting_mismatch)			\
+	x(inode_has_child_snapshots,				\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),		\
+	  BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
 
 #define DOWNGRADE_TABLE()					\
 	x(bucket_stripe_sectors,				\
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 4135b1ea2fec..4cdddf15d752 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -225,11 +225,13 @@ enum bch_fsck_flags {
 	x(inode_multiple_links_but_nlink_0,			207,	FSCK_AUTOFIX)	\
 	x(inode_wrong_backpointer,				208,	FSCK_AUTOFIX)	\
 	x(inode_wrong_nlink,					209,	FSCK_AUTOFIX)	\
+	x(inode_has_child_snapshots_wrong,			287,	0)		\
 	x(inode_unreachable,					210,	FSCK_AUTOFIX)	\
 	x(deleted_inode_but_clean,				211,	FSCK_AUTOFIX)	\
 	x(deleted_inode_missing,				212,	FSCK_AUTOFIX)	\
 	x(deleted_inode_is_dir,					213,	FSCK_AUTOFIX)	\
 	x(deleted_inode_not_unlinked,				214,	FSCK_AUTOFIX)	\
+	x(deleted_inode_has_child_snapshots,			288,	FSCK_AUTOFIX)	\
 	x(extent_overlapping,					215,	0)		\
 	x(key_in_missing_inode,					216,	0)		\
 	x(key_in_wrong_inode_type,				217,	0)		\
@@ -298,7 +300,7 @@ enum bch_fsck_flags {
 	x(accounting_key_replicas_devs_unsorted,		280,	FSCK_AUTOFIX)	\
 	x(accounting_key_version_0,				282,	FSCK_AUTOFIX)	\
 	x(logged_op_but_clean,					283,	FSCK_AUTOFIX)	\
-	x(MAX,							287,	0)
+	x(MAX,							289,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
-- 
cgit v1.2.3


From 84878e82457f2d7900cc70041bd7c05684a6726e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 30 Sep 2024 00:38:13 -0400
Subject: bcachefs: Kill bch2_propagate_key_to_snapshot_leaves()

Dead code now.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/snapshot.c | 97 --------------------------------------------------
 fs/bcachefs/snapshot.h |  3 --
 2 files changed, 100 deletions(-)

diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 1809442b00ee..9f4d13faf7aa 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1732,103 +1732,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
 	return ret;
 }
 
-static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
-{
-	const struct snapshot_t *s = snapshot_t(c, id);
-
-	return s->children[1] ?: s->children[0];
-}
-
-static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
-{
-	u32 child;
-
-	while ((child = bch2_snapshot_smallest_child(c, id)))
-		id = child;
-	return id;
-}
-
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
-					       enum btree_id btree,
-					       struct bkey_s_c interior_k,
-					       u32 leaf_id, struct bpos *new_min_pos)
-{
-	struct btree_iter iter;
-	struct bpos pos = interior_k.k->p;
-	struct bkey_s_c k;
-	struct bkey_i *new;
-	int ret;
-
-	pos.snapshot = leaf_id;
-
-	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto out;
-
-	/* key already overwritten in this snapshot? */
-	if (k.k->p.snapshot != interior_k.k->p.snapshot)
-		goto out;
-
-	if (bpos_eq(*new_min_pos, POS_MIN)) {
-		*new_min_pos = k.k->p;
-		new_min_pos->snapshot = leaf_id;
-	}
-
-	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
-	ret = PTR_ERR_OR_ZERO(new);
-	if (ret)
-		goto out;
-
-	new->k.p.snapshot = leaf_id;
-	ret = bch2_trans_update(trans, &iter, new, 0);
-out:
-	bch2_set_btree_iter_dontneed(&iter);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
-					  enum btree_id btree,
-					  struct bkey_s_c k,
-					  struct bpos *new_min_pos)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_buf sk;
-	u32 restart_count = trans->restart_count;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&sk);
-	bch2_bkey_buf_reassemble(&sk, c, k);
-	k = bkey_i_to_s_c(sk.k);
-
-	*new_min_pos = POS_MIN;
-
-	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
-	     id < k.k->p.snapshot;
-	     id++) {
-		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
-		    !bch2_snapshot_is_leaf(c, id))
-			continue;
-again:
-		ret =   btree_trans_too_many_iters(trans) ?:
-			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
-			bch2_trans_commit(trans, NULL, NULL, 0);
-		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-			bch2_trans_begin(trans);
-			goto again;
-		}
-
-		if (ret)
-			break;
-	}
-
-	bch2_bkey_buf_exit(&sk, c);
-
-	return ret ?: trans_was_restarted(trans, restart_count);
-}
-
 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index eb5ef64221d6..29c94716293e 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
 	return __bch2_key_has_snapshot_overwrites(trans, id, pos);
 }
 
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
-					  struct bkey_s_c, struct bpos *);
-
 int bch2_snapshots_read(struct bch_fs *);
 void bch2_fs_snapshots_exit(struct bch_fs *);
 
-- 
cgit v1.2.3


From 9d86178782a25fac105e550e1c29c7d3f8470116 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 2 Oct 2024 21:23:41 -0400
Subject: bcachefs: bch2_inode_or_descendents_is_open()

fsck can now correctly check if inodes in interior snapshot nodes are
open/in use.

- Tweak the vfs inode rhashtable so that the subvolume ID isn't hashed,
  meaning inums in different subvolumes will hash to the same slot. Note
  that this is a hack, and will cause problems if anyone ever has the
  same file in many different snapshots open all at the same time.

- Then check if any of those subvolumes is a descendent of the snapshot
  ID being checked

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs.c    | 106 ++++++++++++++++++++++++++++++++++++++++++++--------
 fs/bcachefs/fs.h    |   6 +--
 fs/bcachefs/fsck.c  |   7 +++-
 fs/bcachefs/inode.c |   5 ++-
 4 files changed, 103 insertions(+), 21 deletions(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 23cae92d313d..e9e32d21f82d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
 	return a.subvol == b.subvol && a.inum == b.inum;
 }
 
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const subvol_inum *inum = data;
+
+	return jhash(&inum->inum, sizeof(inum->inum), seed);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const struct bch_inode_info *inode = data;
+
+	return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
 				 const void *obj)
 {
@@ -170,32 +184,93 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
 	.head_offset		= offsetof(struct bch_inode_info, hash),
 	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
 	.key_len		= sizeof(subvol_inum),
+	.hashfn			= bch2_vfs_inode_hash_fn,
+	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
 	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
 	.automatic_shrinking	= true,
 };
 
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
 {
-	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
+	struct bch_fs *c = trans->c;
+	struct rhashtable *ht = &c->vfs_inodes_table;
+	subvol_inum inum = (subvol_inum) { .inum = p.offset };
+	DARRAY(u32) subvols;
+	int ret = 0;
 
-bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
-{
 	if (!test_bit(BCH_FS_started, &c->flags))
 		return false;
 
-	subvol_inum inum = {
-		.subvol = snapshot_t(c, p.snapshot)->subvol,
-		.inum	= p.offset,
-	};
+	darray_init(&subvols);
+restart_from_top:
+
+	/*
+	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
+	 * subvolumes in which the given inode number is open.
+	 *
+	 * For this to work, we don't include the subvolume ID in the key that
+	 * we hash - all inodes with the same inode number regardless of
+	 * subvolume will hash to the same slot.
+	 *
+	 * This will be less than ideal if the same file is ever open
+	 * simultaneously in many different snapshots:
+	 */
+	rcu_read_lock();
+	struct rhash_lock_head __rcu *const *bkt;
+	struct rhash_head *he;
+	unsigned int hash;
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+	bkt = rht_bucket(tbl, hash);
+	do {
+		struct bch_inode_info *inode;
+
+		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+			if (inode->ei_inum.inum == inum.inum) {
+				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+						      GFP_NOWAIT|__GFP_NOWARN);
+				if (ret) {
+					rcu_read_unlock();
+					ret = darray_make_room(&subvols, 1);
+					if (ret)
+						goto err;
+					subvols.nr = 0;
+					goto restart_from_top;
+				}
+			}
+		}
+		/* An object might have been moved to a different hash chain,
+		 * while we walk along it - better check and retry.
+		 */
+	} while (he != RHT_NULLS_MARKER(bkt));
+
+	/* Ensure we see any new tables. */
+	smp_rmb();
+
+	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+	if (unlikely(tbl))
+		goto restart;
+	rcu_read_unlock();
+
+	darray_for_each(subvols, i) {
+		u32 snap;
+		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+		if (ret)
+			goto err;
 
-	/* snapshot tree interior node, can't safely delete while online (yet) */
-	if (!inum.subvol) {
-		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
-		return true;
+		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+		if (ret)
+			break;
 	}
+err:
+	darray_exit(&subvols);
+	return ret;
+}
 
-	return __bch2_inode_hash_find(c, inum) != NULL;
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
 }
 
 static void __wait_on_freeing_inode(struct bch_fs *c,
@@ -271,7 +346,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
 
 	set_bit(EI_INODE_HASHED, &inode->ei_flags);
 retry:
-	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+					&inode->ei_inum,
 					&inode->hash,
 					bch2_vfs_inodes_params))) {
 		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 40dbd5774d0b..59f9f7ae728d 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -146,6 +146,8 @@ struct bch_inode_info *
 __bch2_create(struct mnt_idmap *, struct bch_inode_info *,
 	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
 
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
 int bch2_fs_quota_transfer(struct bch_fs *,
 			   struct bch_inode_info *,
 			   struct bch_qid,
@@ -179,8 +181,6 @@ void bch2_inode_update_after_write(struct btree_trans *,
 int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 				  inode_set_fn, void *, unsigned);
 
-bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
-
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
@@ -198,7 +198,7 @@ int bch2_vfs_init(void);
 
 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
 
-static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index f00a36f62323..a1087fd292e4 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1213,7 +1213,11 @@ static int check_inode(struct btree_trans *trans,
 			if (ret)
 				goto err;
 		} else {
-			if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
+			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+			if (ret < 0)
+				goto err;
+
+			if (fsck_err_on(!ret,
 					trans, inode_unlinked_and_not_open,
 				      "inode %llu%u unlinked and not open",
 				      u.bi_inum, u.bi_snapshot)) {
@@ -1221,6 +1225,7 @@ static int check_inode(struct btree_trans *trans,
 				bch_err_msg(c, ret, "in fsck deleting inode");
 				goto err_noprint;
 			}
+			ret = 0;
 		}
 	}
 
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 9d6040d4ba39..2c037e84fbae 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1244,8 +1244,9 @@ next_parent:
 	if (!unlinked)
 		return 0;
 
-	if (bch2_inode_is_open(trans->c, pos))
-		return 0;
+	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+	if (ret)
+		return ret < 0 ? ret : 0;
 
 	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
 	if (ret)
-- 
cgit v1.2.3


From 19773ec997201a45b3a1b5f1171897cba0f30bab Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 7 Oct 2024 18:04:21 -0400
Subject: bcachefs: Disk accounting device validation fixes

- Fix failure to validate that accounting replicas entries point to
  valid devices: this wasn't a real bug since they'd be cleaned up by
  GC, but is still something we should know about

- Fix failure to validate that dev_data_type entries point to valid
  devices: this does fix a real bug, since bch2_accounting_read() would
  then try to copy the counters to that device and pop an inconsistent
  error when the device didn't exist

- Remove accounting entries that are zeroed or invalid: if we're not
  validating them we need to get rid of them: they might not exist in
  the superblock, so we need to trigger the superblock mark path
  when they're readded.

  This fixes the replication.ktest rereplicate test, which was failing
  with "superblock not marked for replicas..."

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/disk_accounting.c  | 149 +++++++++++++++++++++++++++++++----------
 fs/bcachefs/errcode.h          |   3 +-
 fs/bcachefs/sb-errors_format.h |   3 +-
 3 files changed, 118 insertions(+), 37 deletions(-)

diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9f3133e3e7e5..e309fb78529b 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
 		*p = swab64(*p);
 }
 
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+					    struct disk_accounting_pos acc)
+{
+	unsafe_memcpy(r, &acc.replicas,
+		      replicas_entry_bytes(&acc.replicas),
+		      "variable length struct");
+}
+
 static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
 {
 	struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
 
 	switch (acc_k.type) {
 	case BCH_DISK_ACCOUNTING_replicas:
-		unsafe_memcpy(r, &acc_k.replicas,
-			      replicas_entry_bytes(&acc_k.replicas),
-			      "variable length struct");
+		__accounting_to_replicas(r, acc_k);
 		return true;
 	default:
 		return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
 	return ret;
 }
 
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+					      struct disk_accounting_pos acc,
+					      u64 *v, unsigned nr)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0, invalid_dev = -1;
+
+	switch (acc.type) {
+	case BCH_DISK_ACCOUNTING_replicas: {
+		struct bch_replicas_padded r;
+		__accounting_to_replicas(&r.e, acc);
+
+		for (unsigned i = 0; i < r.e.nr_devs; i++)
+			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+			    !bch2_dev_exists(c, r.e.devs[i])) {
+				invalid_dev = r.e.devs[i];
+				goto invalid_device;
+			}
+
+		/*
+		 * All replicas entry checks except for invalid device are done
+		 * in bch2_accounting_validate
+		 */
+		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+				trans, accounting_replicas_not_marked,
+				"accounting not marked in superblock replicas\n  %s",
+				(printbuf_reset(&buf),
+				 bch2_accounting_key_to_text(&buf, &acc),
+				 buf.buf))) {
+			/*
+			 * We're not RW yet and still single threaded, dropping
+			 * and retaking lock is ok:
+			 */
+			percpu_up_write(&c->mark_lock);
+			ret = bch2_mark_replicas(c, &r.e);
+			if (ret)
+				goto fsck_err;
+			percpu_down_write(&c->mark_lock);
+		}
+		break;
+	}
+
+	case BCH_DISK_ACCOUNTING_dev_data_type:
+		if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+			invalid_dev = acc.dev_data_type.dev;
+			goto invalid_device;
+		}
+		break;
+	}
+
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+invalid_device:
+	if (fsck_err(trans, accounting_to_invalid_device,
+		     "accounting entry points to invalid device %i\n  %s",
+		     invalid_dev,
+		     (printbuf_reset(&buf),
+		      bch2_accounting_key_to_text(&buf, &acc),
+		      buf.buf))) {
+		for (unsigned i = 0; i < nr; i++)
+			v[i] = -v[i];
+
+		ret = commit_do(trans, NULL, NULL, 0,
+				bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+			-BCH_ERR_remove_disk_accounting_entry;
+	} else {
+		ret = -BCH_ERR_remove_disk_accounting_entry;
+	}
+	goto fsck_err;
+}
+
 /*
  * At startup time, initialize the in memory accounting from the btree (and
  * journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 	keys->gap = keys->nr = dst - keys->data;
 
-	percpu_down_read(&c->mark_lock);
-	for (unsigned i = 0; i < acc->k.nr; i++) {
-		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+	percpu_down_write(&c->mark_lock);
+	unsigned i = 0;
+	while (i < acc->k.nr) {
+		unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
 
-		if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
-			continue;
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
 
-		struct bch_replicas_padded r;
-		if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
-			continue;
+		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+		bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
 
 		/*
-		 * If the replicas entry is invalid it'll get cleaned up by
-		 * check_allocations:
+		 * If the entry counters are zeroed, it should be treated as
+		 * nonexistent - it might point to an invalid device.
+		 *
+		 * Remove it, so that if it's re-added it gets re-marked in the
+		 * superblock:
 		 */
-		if (bch2_replicas_entry_validate(&r.e, c, &buf))
+		ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+			? -BCH_ERR_remove_disk_accounting_entry
+			: bch2_disk_accounting_validate_late(trans, acc_k,
+							v, acc->k.data[idx].nr_counters);
+
+		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+			free_percpu(acc->k.data[idx].v[0]);
+			free_percpu(acc->k.data[idx].v[1]);
+			darray_remove_item(&acc->k, &acc->k.data[idx]);
+			eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+					accounting_pos_cmp, NULL);
+			ret = 0;
 			continue;
-
-		struct disk_accounting_pos k;
-		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
-		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
-				trans, accounting_replicas_not_marked,
-				"accounting not marked in superblock replicas\n  %s",
-				(printbuf_reset(&buf),
-				 bch2_accounting_key_to_text(&buf, &k),
-				 buf.buf))) {
-			/*
-			 * We're not RW yet and still single threaded, dropping
-			 * and retaking lock is ok:
-			 */
-			percpu_up_read(&c->mark_lock);
-			ret = bch2_mark_replicas(c, &r.e);
-			if (ret)
-				goto fsck_err;
-			percpu_down_read(&c->mark_lock);
 		}
+
+		if (ret)
+			goto fsck_err;
+		i++;
 	}
 
 	preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 	preempt_enable();
 fsck_err:
-	percpu_up_read(&c->mark_lock);
+	percpu_up_write(&c->mark_lock);
 err:
 	printbuf_exit(&buf);
 	bch2_trans_put(trans);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 60b7875adada..649263516ab1 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -268,7 +268,8 @@
 	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
 	x(BCH_ERR_nopromote,		nopromote_enomem)			\
 	x(0,				invalid_snapshot_node)			\
-	x(0,				option_needs_open_fs)
+	x(0,				option_needs_open_fs)			\
+	x(0,				remove_disk_accounting_entry)
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 4cdddf15d752..62ad25f91b78 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -291,6 +291,7 @@ enum bch_fsck_flags {
 	x(alloc_key_stripe_sectors_wrong,			271,	FSCK_AUTOFIX)	\
 	x(accounting_mismatch,					272,	FSCK_AUTOFIX)	\
 	x(accounting_replicas_not_marked,			273,	0)		\
+	x(accounting_to_invalid_device,				289,	0)		\
 	x(invalid_btree_id,					274,	0)		\
 	x(alloc_key_io_time_bad,				275,	0)		\
 	x(alloc_key_fragmentation_lru_wrong,			276,	FSCK_AUTOFIX)	\
@@ -300,7 +301,7 @@ enum bch_fsck_flags {
 	x(accounting_key_replicas_devs_unsorted,		280,	FSCK_AUTOFIX)	\
 	x(accounting_key_version_0,				282,	FSCK_AUTOFIX)	\
 	x(logged_op_but_clean,					283,	FSCK_AUTOFIX)	\
-	x(MAX,							289,	0)
+	x(MAX,							290,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
-- 
cgit v1.2.3


From 0151d10a480da22d0c137f859645fdc79e10b551 Mon Sep 17 00:00:00 2001
From: Piotr Zalewski <pZ010001011111@proton.me>
Date: Sun, 29 Sep 2024 14:26:45 +0000
Subject: bcachefs: add check for btree id against max in try read node

Add check for read node's btree_id against BTREE_ID_NR_MAX in
try_read_btree_node to prevent triggering EBUG_ON condition in
bch2_btree_id_root[1].

[1] https://syzkaller.appspot.com/bug?extid=cf7b2215b5d70600ec00

Reported-by: syzbot+cf7b2215b5d70600ec00@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=cf7b2215b5d70600ec00
Fixes: 4409b8081d16 ("bcachefs: Repair pass for scanning for btree nodes")
Signed-off-by: Piotr Zalewski <pZ010001011111@proton.me>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_node_scan.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 1e694fedc5da..a7aedb134e9f 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
 		return;
 
+	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+		return;
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
-- 
cgit v1.2.3


From a1541541480073365ac701a1125d750a86bfd644 Mon Sep 17 00:00:00 2001
From: Alan Huang <mmpgouride@gmail.com>
Date: Wed, 9 Oct 2024 00:59:08 +0800
Subject: bcachefs: Release transaction before wake up

We will get this if we wake up first:

Kernel panic - not syncing: btree_node_write_done leaked btree_trans

since there are still transactions waiting for cycle detectors after
BTREE_NODE_write_in_flight is cleared.

Signed-off-by: Alan Huang <mmpgouride@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1c1448b52207..cf933409d385 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 	struct btree_trans *trans = bch2_trans_get(c);
 
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-	__btree_node_write_done(c, b);
-	six_unlock_read(&b->c.lock);
 
+	/* we don't need transaction context anymore after we got the lock. */
 	bch2_trans_put(trans);
+	__btree_node_write_done(c, b);
+	six_unlock_read(&b->c.lock);
 }
 
 static void btree_node_write_work(struct work_struct *work)
-- 
cgit v1.2.3


From a30f32222df2a3777f8c09ef460ae8ad278021e6 Mon Sep 17 00:00:00 2001
From: Mohammed Anees <pvmohammedanees2003@gmail.com>
Date: Sat, 5 Oct 2024 18:32:29 +0530
Subject: bcachefs: Fix NULL pointer dereference in bch2_opt_to_text

This patch adds a bounds check to the bch2_opt_to_text function to prevent
NULL pointer dereferences when accessing the opt->choices array. This
ensures that the index used is within valid bounds before dereferencing.
The new version enhances the readability.

Reported-and-tested-by: syzbot+37186860aa7812b331d5@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=37186860aa7812b331d5
Signed-off-by: Mohammed Anees <pvmohammedanees2003@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/opts.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 232be8a44051..84097235eea9 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out,
 			prt_printf(out, "%lli", v);
 		break;
 	case BCH_OPT_STR:
-		if (flags & OPT_SHOW_FULL_LIST)
+		if (v < opt->min || v >= opt->max - 1)
+			prt_printf(out, "(invalid option %lli)", v);
+		else if (flags & OPT_SHOW_FULL_LIST)
 			prt_string_option(out, opt->choices, v);
 		else
 			prt_str(out, opt->choices[v]);
-- 
cgit v1.2.3


From 9205d24cf7714dd2b5fa09a22964dbcdeed6c430 Mon Sep 17 00:00:00 2001
From: Alan Huang <mmpgouride@gmail.com>
Date: Wed, 9 Oct 2024 01:33:05 +0800
Subject: bcachefs: Fix state lock involved deadlock

We took a write ref; if the fs then went RO, that would lead to
a deadlock. It actually happens:

00171 ========= TEST   generic/279
00171
00172 bcachefs (vdb): starting version 1.12: rebalance_work_acct_fix opts=nocow
00172 bcachefs (vdb): recovering from clean shutdown, journal seq 35
00172 bcachefs (vdb): accounting_read... done
00172 bcachefs (vdb): alloc_read... done
00172 bcachefs (vdb): stripes_read... done
00172 bcachefs (vdb): snapshots_read... done
00172 bcachefs (vdb): journal_replay... done
00172 bcachefs (vdb): resume_logged_ops... done
00172 bcachefs (vdb): going read-write
00172 bcachefs (vdb): done starting filesystem
00172 FSTYP         -- bcachefs
00172 PLATFORM      -- Linux/aarch64 farm3-kvm 6.11.0-rc1-ktest-g3e290a0b8e34 #7030 SMP Tue Oct  8 14:15:12 UTC 2024
00172 MKFS_OPTIONS  -- --nocow /dev/vdc
00172 MOUNT_OPTIONS -- /dev/vdc /mnt/scratch
00172
00172 bcachefs (vdc): starting version 1.12: rebalance_work_acct_fix opts=nocow
00172 bcachefs (vdc): initializing new filesystem
00172 bcachefs (vdc): going read-write
00172 bcachefs (vdc): marking superblocks
00172 bcachefs (vdc): initializing freespace
00172 bcachefs (vdc): done initializing freespace
00172 bcachefs (vdc): reading snapshots table
00172 bcachefs (vdc): reading snapshots done
00172 bcachefs (vdc): done starting filesystem
00173 bcachefs (vdc): shutting down
00173 bcachefs (vdc): going read-only
00173 bcachefs (vdc): finished waiting for writes to stop
00173 bcachefs (vdc): flushing journal and stopping allocators, journal seq 4
00173 bcachefs (vdc): flushing journal and stopping allocators complete, journal seq 6
00173 bcachefs (vdc): shutdown complete, journal seq 7
00173 bcachefs (vdc): marking filesystem clean
00173 bcachefs (vdc): shutdown complete
00173 bcachefs (vdb): shutting down
00173 bcachefs (vdb): going read-only
00361 INFO: task umount:6180 blocked for more than 122 seconds.
00361 Not tainted 6.11.0-rc1-ktest-g3e290a0b8e34 #7030
00361 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
00361 task:umount          state:D stack:0     pid:6180  tgid:6180  ppid:6176   flags:0x00000004
00361 Call trace:
00362 __switch_to (arch/arm64/kernel/process.c:556)
00362 __schedule (kernel/sched/core.c:5191 kernel/sched/core.c:6529)
00363 schedule (include/asm-generic/bitops/generic-non-atomic.h:128 include/linux/thread_info.h:192 include/linux/sched.h:2084 kernel/sched/core.c:6608 kernel/sched/core.c:6621)
00365 bch2_fs_read_only (fs/bcachefs/super.c:346 (discriminator 41))
00367 __bch2_fs_stop (fs/bcachefs/super.c:620)
00368 bch2_put_super (fs/bcachefs/fs.c:1942)
00369 generic_shutdown_super (include/linux/list.h:373 (discriminator 2) fs/super.c:650 (discriminator 2))
00371 bch2_kill_sb (fs/bcachefs/fs.c:2170)
00372 deactivate_locked_super (fs/super.c:434 fs/super.c:475)
00373 deactivate_super (fs/super.c:508)
00374 cleanup_mnt (fs/namespace.c:250 fs/namespace.c:1374)
00376 __cleanup_mnt (fs/namespace.c:1381)
00376 task_work_run (include/linux/sched.h:2024 kernel/task_work.c:224)
00377 do_notify_resume (include/linux/resume_user_mode.h:50 arch/arm64/kernel/entry-common.c:151)
00377 el0_svc (arch/arm64/include/asm/daifflags.h:28 arch/arm64/kernel/entry-common.c:171 arch/arm64/kernel/entry-common.c:178 arch/arm64/kernel/entry-common.c:713)
00377 el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:731)
00378 el0t_64_sync (arch/arm64/kernel/entry.S:598)
00378 INFO: task tee:6182 blocked for more than 122 seconds.
00378 Not tainted 6.11.0-rc1-ktest-g3e290a0b8e34 #7030
00378 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
00378 task:tee             state:D stack:0     pid:6182  tgid:6182  ppid:533    flags:0x00000004
00378 Call trace:
00378 __switch_to (arch/arm64/kernel/process.c:556)
00378 __schedule (kernel/sched/core.c:5191 kernel/sched/core.c:6529)
00378 schedule (include/asm-generic/bitops/generic-non-atomic.h:128 include/linux/thread_info.h:192 include/linux/sched.h:2084 kernel/sched/core.c:6608 kernel/sched/core.c:6621)
00378 schedule_preempt_disabled (kernel/sched/core.c:6680)
00379 rwsem_down_read_slowpath (kernel/locking/rwsem.c:1073 (discriminator 1))
00379 down_read (kernel/locking/rwsem.c:1529)
00381 bch2_gc_gens (fs/bcachefs/sb-members.h:77 fs/bcachefs/sb-members.h:88 fs/bcachefs/sb-members.h:128 fs/bcachefs/btree_gc.c:1240)
00383 bch2_fs_store_inner (fs/bcachefs/sysfs.c:473)
00385 bch2_fs_internal_store (fs/bcachefs/sysfs.c:417 fs/bcachefs/sysfs.c:580 fs/bcachefs/sysfs.c:576)
00386 sysfs_kf_write (fs/sysfs/file.c:137)
00387 kernfs_fop_write_iter (fs/kernfs/file.c:334)
00389 vfs_write (fs/read_write.c:497 fs/read_write.c:590)
00390 ksys_write (fs/read_write.c:643)
00391 __arm64_sys_write (fs/read_write.c:652)
00391 invoke_syscall.constprop.0 (arch/arm64/include/asm/syscall.h:61 arch/arm64/kernel/syscall.c:54)
00392 do_el0_svc (include/linux/thread_info.h:127 (discriminator 2) arch/arm64/kernel/syscall.c:140 (discriminator 2) arch/arm64/kernel/syscall.c:151 (discriminator 2))
00392 el0_svc (arch/arm64/include/asm/irqflags.h:55 arch/arm64/include/asm/irqflags.h:76 arch/arm64/kernel/entry-common.c:165 arch/arm64/kernel/entry-common.c:178 arch/arm64/kernel/entry-common.c:713)
00392 el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:731)
00392 el0t_64_sync (arch/arm64/kernel/entry.S:598)

Signed-off-by: Alan Huang <mmpgouride@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_gc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 771154e3a291..94bbd8505582 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
 	u64 b, start_time = local_clock();
 	int ret;
 
-	/*
-	 * Ideally we would be using state_lock and not gc_gens_lock here, but that
-	 * introduces a deadlock in the RO path - we currently take the state
-	 * lock at the start of going RO, thus the gc thread may get stuck:
-	 */
 	if (!mutex_trylock(&c->gc_gens_lock))
 		return 0;
 
 	trace_and_count(c, gc_gens_start, c);
 
-	down_read(&c->state_lock);
+	/*
+	 * We have to use trylock here. Otherwise, we would
+	 * introduce a deadlock in the RO path - we take the
+	 * state lock at the start of going RO.
+	 */
+	if (!down_read_trylock(&c->state_lock)) {
+		mutex_unlock(&c->gc_gens_lock);
+		return 0;
+	}
 
 	for_each_member_device(c, ca) {
 		struct bucket_gens *gens = bucket_gens(ca);
-- 
cgit v1.2.3


From 04b670de2859a8a8b0830779f9c9bda5d39662ab Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 7 Oct 2024 16:54:11 -0400
Subject: closures: Add closure_wait_event_timeout()

Add a closure version of wait_event_timeout(), with the same semantics.

The closure version is useful because unlike wait_event(), it allows
blocking code to run in the conditional expression.

Cc: Coly Li <colyli@suse.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 2af44427107d..880fe85e35e9 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -454,4 +454,39 @@ do {									\
 		__closure_wait_event(waitlist, _cond);			\
 } while (0)
 
+#define __closure_wait_event_timeout(waitlist, _cond, _until)		\
+({									\
+	struct closure cl;						\
+	long _t;							\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		closure_wait(waitlist, &cl);				\
+		if (_cond) {						\
+			_t = max_t(long, 1L, _until - jiffies);		\
+			break;						\
+		}							\
+		_t = max_t(long, 0L, _until - jiffies);			\
+		if (!_t)						\
+			break;						\
+		closure_sync_timeout(&cl, _t);				\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+	_t;								\
+})
+
+/*
+ * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
+ * condition became true
+ */
+#define closure_wait_event_timeout(waitlist, _cond, _timeout)		\
+({									\
+	unsigned long _until = jiffies + _timeout;			\
+	(_cond)								\
+		? max_t(long, 1L, _until - jiffies)			\
+		: __closure_wait_event_timeout(waitlist, _cond, _until);\
+})
+
 #endif /* _LINUX_CLOSURE_H */
-- 
cgit v1.2.3


From a7e2dd58fbbd8882557135b9433192e038fdb88e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 7 Oct 2024 16:55:34 -0400
Subject: bcachefs: Check if stuck in journal_res_get()

Like how we already do when the allocator seems to be stuck, check if
we're waiting too long for a journal reservation and print some debug
info.

This is specifically to track down
https://github.com/koverstreet/bcachefs/issues/656

which is showing up in userspace where we don't have sysfs/debugfs to
get the journal debug info.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index f5f7db50ca31..dc099f06341f 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
 {
 	int ret;
 
+	if (closure_wait_event_timeout(&j->async_wait,
+		   (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+		   (flags & JOURNAL_RES_GET_NONBLOCK),
+		   HZ * 10))
+		return ret;
+
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct printbuf buf = PRINTBUF;
+	bch2_journal_debug_to_text(&buf, j);
+	bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+		buf.buf);
+	printbuf_exit(&buf);
+
 	closure_wait_event(&j->async_wait,
 		   (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
 		   (flags & JOURNAL_RES_GET_NONBLOCK));
-- 
cgit v1.2.3


From 3b80552e70573764bbf38b89c58749aef9dd8753 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 9 Oct 2024 16:21:00 -0400
Subject: bcachefs: __wait_on_freeing_inode: Switch to wait_bit_queue_entry

inode_bit_waitqueue() is changing - this update clears the way for
sched changes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index e9e32d21f82d..857175f4bbbe 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -278,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
 				    subvol_inum inum)
 {
 	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+	struct wait_bit_queue_entry wait;
+
 	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->v.i_lock);
-- 
cgit v1.2.3


From 7d84d9f449f929a3b010cab04a0adc97c7853ab7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 11 Oct 2024 22:00:44 -0400
Subject: bcachefs: Fix bch2_have_enough_devs() for BCH_SB_MEMBER_INVALID

This fixes a kasan splat in the ec device removal tests.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/replicas.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 797da1032318..477ef0997949 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -821,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
 
 		rcu_read_lock();
 		for (unsigned i = 0; i < e->nr_devs; i++) {
+			if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+				nr_failed++;
+				continue;
+			}
+
 			nr_online += test_bit(e->devs[i], devs.d);
 
 			struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
-- 
cgit v1.2.3


From c1bd21bb65182afca490db4031890802a1a546b7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 11 Oct 2024 22:06:58 -0400
Subject: bcachefs: Fix invalid shift in member_to_text()

Reported-by: syzbot+064ce437a1ad63d3f6ef@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/sb-members.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 02bcde3c1b02..fb08dd680dac 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err,
 		return -BCH_ERR_invalid_sb_members;
 	}
 
+	if (m.btree_bitmap_shift >= 64) {
+		prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+		return -BCH_ERR_invalid_sb_members;
+	}
+
 	return 0;
 }
 
@@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out,
 	prt_newline(out);
 
 	prt_printf(out, "Btree allocated bitmap blocksize:\t");
-	prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+	if (m.btree_bitmap_shift < 64)
+		prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+	else
+		prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
 	prt_newline(out);
 
 	prt_printf(out, "Btree allocated bitmap:\t");
-- 
cgit v1.2.3


From 672f75238e54a642d3ccbcb62d3977383ab50bb0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 02:44:38 -0400
Subject: bcachefs: Fix accounting replay flags

BCH_TRANS_COMMIT_journal_reclaim without BCH_WATERMARK_reclaim means
"return an error if low on journal space" - but accounting replay must
succeed.

Fixes https://github.com/koverstreet/bcachefs/issues/656

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 6db72d3bad7d..55e1504a8130 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -287,7 +287,8 @@ int bch2_journal_replay(struct bch_fs *c)
 				BCH_TRANS_COMMIT_no_enospc|
 				BCH_TRANS_COMMIT_journal_reclaim|
 				BCH_TRANS_COMMIT_skip_accounting_apply|
-				BCH_TRANS_COMMIT_no_journal_res,
+				BCH_TRANS_COMMIT_no_journal_res|
+				BCH_WATERMARK_reclaim,
 			     bch2_journal_replay_accounting_key(trans, k));
 		if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
 			goto err;
-- 
cgit v1.2.3


From 9183c2b11e93f16dfd89a7fa09ec50c0be71f15a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 05:00:26 -0400
Subject: bcachefs: Fix bkey_nocow_lock()

This fixes an assertion pop in nocow_locking.c

00243 kernel BUG at fs/bcachefs/nocow_locking.c:41!
00243 Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
00243 Modules linked in:
00243 Hardware name: linux,dummy-virt (DT)
00243 pstate: 60001005 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--)
00244 pc : bch2_bucket_nocow_unlock (/home/testdashboard/linux-7/fs/bcachefs/nocow_locking.c:41)
00244 lr : bkey_nocow_lock (/home/testdashboard/linux-7/fs/bcachefs/data_update.c:79)
00244 sp : ffffff80c82373b0
00244 x29: ffffff80c82373b0 x28: ffffff80e08958c0 x27: ffffff80e0880000
00244 x26: ffffff80c8237a98 x25: 00000000000000a0 x24: ffffff80c8237ab0
00244 x23: 00000000000000c0 x22: 0000000000000008 x21: 0000000000000000
00244 x20: ffffff80c8237a98 x19: 0000000000000018 x18: 0000000000000000
00244 x17: 0000000000000000 x16: 000000000000003f x15: 0000000000000000
00244 x14: 0000000000000008 x13: 0000000000000018 x12: 0000000000000000
00244 x11: 0000000000000000 x10: ffffff80e0880000 x9 : ffffffc0803ac1a4
00244 x8 : 0000000000000018 x7 : ffffff80c8237a88 x6 : ffffff80c8237ab0
00244 x5 : ffffff80e08988d0 x4 : 00000000ffffffff x3 : 0000000000000000
00244 x2 : 0000000000000004 x1 : 0003000000000d1e x0 : ffffff80e08988c0
00244 Call trace:
00244 bch2_bucket_nocow_unlock (/home/testdashboard/linux-7/fs/bcachefs/nocow_locking.c:41)
00245 bch2_data_update_init (/home/testdashboard/linux-7/fs/bcachefs/data_update.c:627 (discriminator 1))
00245 promote_alloc.isra.0 (/home/testdashboard/linux-7/fs/bcachefs/io_read.c:242 /home/testdashboard/linux-7/fs/bcachefs/io_read.c:304)
00245 __bch2_read_extent (/home/testdashboard/linux-7/fs/bcachefs/io_read.c:949)
00246 __bch2_read (/home/testdashboard/linux-7/fs/bcachefs/io_read.c:1215)
00246 bch2_direct_IO_read (/home/testdashboard/linux-7/fs/bcachefs/fs-io-direct.c:132)
00246 bch2_read_iter (/home/testdashboard/linux-7/fs/bcachefs/fs-io-direct.c:201)
00247 aio_read.constprop.0 (/home/testdashboard/linux-7/fs/aio.c:1602)
00247 io_submit_one.constprop.0 (/home/testdashboard/linux-7/fs/aio.c:2003 /home/testdashboard/linux-7/fs/aio.c:2052)
00248 __arm64_sys_io_submit (/home/testdashboard/linux-7/fs/aio.c:2111 /home/testdashboard/linux-7/fs/aio.c:2081 /home/testdashboard/linux-7/fs/aio.c:2081)
00248 invoke_syscall.constprop.0 (/home/testdashboard/linux-7/arch/arm64/include/asm/syscall.h:61 /home/testdashboard/linux-7/arch/arm64/kernel/syscall.c:54)
00248 ========= FAILED TIMEOUT tiering_variable_buckets_replicas in 1200s

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 462b1a2fe1ad..a6ee0beee6b0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
 					if (ptr2 == ptr)
 						break;
 
+					ca = bch2_dev_have_ref(c, ptr2->dev);
 					bucket = PTR_BUCKET_POS(ca, ptr2);
 					bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
 				}
-- 
cgit v1.2.3


From c986dd7ecba185ad2a36b0815940f34deb2a8170 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 9 Oct 2024 21:28:11 -0400
Subject: bcachefs: Improve check_snapshot_exists()

Check if we have snapshot_trees or subvolumes that refer to the snapshot
node being reconstructed, and use them.

With this, the kill_btree_root test that blows away the snapshots btree
now passes, and we're able to successfully reconstruct.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/snapshot.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 9f4d13faf7aa..ae57638506c3 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
 	if (bch2_snapshot_equiv(c, id))
 		return 0;
 
-	/* 0 is an invalid tree ID */
+	/* Do we need to reconstruct the snapshot_tree entry as well? */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
 	u32 tree_id = 0;
-	int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
+				     0, k, ret) {
+		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+			tree_id = k.k->p.offset;
+			break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
 	if (ret)
 		return ret;
 
+	if (!tree_id) {
+		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+		if (ret)
+			return ret;
+	}
+
 	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
 	ret = PTR_ERR_OR_ZERO(snapshot);
 	if (ret)
@@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
 	snapshot->v.tree	= cpu_to_le32(tree_id);
 	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
 
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+				     0, k, ret) {
+		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
+			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
+			break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
 	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
 		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
 				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
-- 
cgit v1.2.3


From 573ddcdc56077615f8db045cd06b44dd8fc01f4b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 15:38:33 -0400
Subject: bcachefs: fix uaf in bch2_dio_write_done()

Reported-by: syzbot+19ad84d5133871207377@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs-io-direct.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index ee1c0325f313..6d3a05ae5da8 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
 
 static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 {
+	struct bch_fs *c = dio->op.c;
 	struct kiocb *req = dio->req;
 	struct bch_inode_info *inode = dio->inode;
 	bool sync = dio->sync;
@@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 	ret = dio->op.error ?: ((long) dio->written << 9);
 	bio_put(&dio->op.wbio.bio);
 
-	bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
 
 	/* inode->i_dio_count is our ref on inode and thus bch_fs */
 	inode_dio_end(&inode->v);
-- 
cgit v1.2.3


From a319aeaebb6c7d8e68125943d454031b27d6b3ef Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 15:49:23 -0400
Subject: bcachefs: Fix missing bounds checks in bch2_alloc_read()

We were checking that the alloc key was for a valid device, but not a
valid bucket.

This is the upgrade path from versions prior to bcachefs being mainlined.

Reported-by: syzbot+a1b59c8e1a3f022fd301@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 4e4a448f6931..6e161f8ffe8d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
 				continue;
 			}
 
+			if (k.k->p.offset < ca->mi.first_bucket) {
+				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+				continue;
+			}
+
+			if (k.k->p.offset >= ca->mi.nbuckets) {
+				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+				continue;
+			}
+
 			struct bch_alloc_v4 a;
 			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
 			0;
-- 
cgit v1.2.3


From 9f25dbe0bf91108be38c690628910c3ee85f6c1c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 17:03:30 -0400
Subject: bcachefs: Add missing validation for bch_stripe.csum_granularity_bits

Reported-by: syzbot+f8c98a50c323635be65d@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/ec.c               | 11 ++++++++++-
 fs/bcachefs/sb-errors_format.h |  3 ++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 1587c6e1866a..b29838083f1f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
 			 "incorrect value size (%zu < %u)",
 			 bkey_val_u64s(k.k), stripe_val_u64s(s));
 
+	bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+			 c, stripe_csum_granularity_bad,
+			 "invalid csum granularity (%u >= 64)",
+			 s->csum_granularity_bits);
+
 	ret = bch2_bkey_ptrs_validate(c, k, flags);
 fsck_err:
 	return ret;
@@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 		   nr_data,
 		   s.nr_redundant);
 	bch2_prt_csum_type(out, s.csum_type);
-	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+	prt_str(out, " gran ");
+	if (s.csum_granularity_bits < 64)
+		prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+	else
+		prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
 
 	if (s.disk_label) {
 		prt_str(out, " label");
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 62ad25f91b78..aab328ac6dfa 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -180,6 +180,7 @@ enum bch_fsck_flags {
 	x(reflink_p_to_missing_reflink_v,			166,	0)		\
 	x(stripe_pos_bad,					167,	0)		\
 	x(stripe_val_size_bad,					168,	0)		\
+	x(stripe_csum_granularity_bad,				290,	0)		\
 	x(stripe_sector_count_wrong,				169,	0)		\
 	x(snapshot_tree_pos_bad,				170,	0)		\
 	x(snapshot_tree_to_missing_snapshot,			171,	0)		\
@@ -301,7 +302,7 @@ enum bch_fsck_flags {
 	x(accounting_key_replicas_devs_unsorted,		280,	FSCK_AUTOFIX)	\
 	x(accounting_key_version_0,				282,	FSCK_AUTOFIX)	\
 	x(logged_op_but_clean,					283,	FSCK_AUTOFIX)	\
-	x(MAX,							290,	0)
+	x(MAX,							291,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
-- 
cgit v1.2.3


From b1e562265ed801c36e8817c67c97a47137870ece Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 13 Oct 2024 20:16:45 -0400
Subject: bcachefs: Fix kasan splat in new_stripe_alloc_buckets()

Update for BCH_SB_MEMBER_INVALID.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/ec.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index b29838083f1f..668d013cc97f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1885,7 +1885,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
 	bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
 
 	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
-		__clear_bit(v->ptrs[i].dev, devs.d);
+		/*
+		 * Note: we don't yet repair invalid blocks (failed/removed
+		 * devices) when reusing stripes - we still need a codepath to
+		 * walk backpointers and update all extents that point to that
+		 * block when updating the stripe
+		 */
+		if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+			__clear_bit(v->ptrs[i].dev, devs.d);
+
 		if (i < h->s->nr_data)
 			nr_have_data++;
 		else
-- 
cgit v1.2.3


From cb6055e66faa0d4c15587ef67be18bc199aef085 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 13 Oct 2024 19:38:00 -0400
Subject: bcachefs: Handle race between stripe reuse, invalidate_stripe_to_dev

When creating a new stripe, we may reuse an existing stripe that has
some empty and some nonempty blocks.

Generally, the existing stripe won't change underneath us - except for
block sector counts, which we copy to the new key in
ec_stripe_key_update.

But the device removal path can now invalidate stripe pointers to a
device, and that can race with stripe reuse.

Change ec_stripe_key_update() to check for and resolve this
inconsistency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/ec.c      | 73 +++++++++++++++++++++++++++++++--------------------
 fs/bcachefs/extents.h | 10 +++++++
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 668d013cc97f..e410cfe37b1a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1206,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
 /* stripe creation: */
 
 static int ec_stripe_key_update(struct btree_trans *trans,
-				struct bkey_i_stripe *new,
-				bool create)
+				struct bkey_i_stripe *old,
+				struct bkey_i_stripe *new)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
+	bool create = !old;
 
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-			       new->k.p, BTREE_ITER_intent);
-	ret = bkey_err(k);
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+					       new->k.p, BTREE_ITER_intent);
+	int ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
-		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
-				     create ? "creating" : "updating",
-				     bch2_bkey_types[k.k->type]);
+	if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+				    c, "error %s stripe: got existing key type %s",
+				    create ? "creating" : "updating",
+				    bch2_bkey_types[k.k->type])) {
 		ret = -EINVAL;
 		goto err;
 	}
 
 	if (k.k->type == KEY_TYPE_stripe) {
-		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
-		unsigned i;
+		const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
 
-		if (old->nr_blocks != new->v.nr_blocks) {
-			bch_err(c, "error updating stripe: nr_blocks does not match");
-			ret = -EINVAL;
-			goto err;
-		}
+		BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+		BUG_ON(old->v.nr_blocks != v->nr_blocks);
+
+		for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+			unsigned sectors = stripe_blockcount_get(v, i);
+
+			if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+				struct printbuf buf = PRINTBUF;
 
-		for (i = 0; i < new->v.nr_blocks; i++) {
-			unsigned v = stripe_blockcount_get(old, i);
+				prt_printf(&buf, "stripe changed nonempty block %u", i);
+				prt_str(&buf, "\nold: ");
+				bch2_bkey_val_to_text(&buf, c, k);
+				prt_str(&buf, "\nnew: ");
+				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+				bch2_fs_inconsistent(c, "%s", buf.buf);
+				printbuf_exit(&buf);
+				ret = -EINVAL;
+				goto err;
+			}
 
-			BUG_ON(v &&
-			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
-				old->ptrs[i].gen != new->v.ptrs[i].gen ||
-				old->ptrs[i].offset != new->v.ptrs[i].offset));
+			/*
+			 * If the stripe ptr changed underneath us, it must have
+			 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
+			 */
+			if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+				BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+				if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+					new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+			}
 
-			stripe_blockcount_set(&new->v, i, v);
+			stripe_blockcount_set(&new->v, i, sectors);
 		}
 	}
 
@@ -1508,8 +1523,10 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 			    BCH_TRANS_COMMIT_no_check_rw|
 			    BCH_TRANS_COMMIT_no_enospc,
 			    ec_stripe_key_update(trans,
-					bkey_i_to_stripe(&s->new_stripe.key),
-					!s->have_existing_stripe));
+					s->have_existing_stripe
+					? bkey_i_to_stripe(&s->existing_stripe.key)
+					: NULL,
+					bkey_i_to_stripe(&s->new_stripe.key)));
 	bch_err_msg(c, ret, "creating stripe key");
 	if (ret) {
 		goto err;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ed5001dd662e..923a5f1849a8 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
 int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
 			    enum bch_validate_flags);
 
+/* True iff ptr1 and ptr2 match on cached, unwritten, offset, dev and gen. */
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, struct bch_extent_ptr ptr2)
+{
+	return (ptr1.cached	== ptr2.cached &&
+		ptr1.unwritten	== ptr2.unwritten &&
+		ptr1.offset	== ptr2.offset &&
+		ptr1.dev	== ptr2.dev &&
+		ptr1.gen	== ptr2.gen);
+}
+
 void bch2_ptr_swab(struct bkey_s);
 
 const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
-- 
cgit v1.2.3


From 5e3b72324d32629fa013f86657308f3dbc1115e1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 12 Oct 2024 14:36:38 -0400
Subject: bcachefs: Fix sysfs warning in fstests generic/730,731

sysfs warns if we're removing a symlink from a directory that's no
longer in sysfs; this is triggered by fstests generic/730, which
simulates hot removal of a block device.

This patch is however not a correct fix, since checking
kobj->state_in_sysfs on a kobj owned by another subsystem is racy.

A better fix would be to add the appropriate check to
sysfs_remove_link() - and sysfs_create_link() as well.

But kobject_add_internal()/kobject_del() do not as of today have locking
that would support that.

Note that the block/holder.c code appears to be subject to this race as
well.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc:  Christoph Hellwig <hch@lst.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/super.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 873e4be7e1dc..77d811a539af 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock);
 
 DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
 
+static void bch2_dev_unlink(struct bch_dev *);
 static void bch2_dev_free(struct bch_dev *);
 static int bch2_dev_alloc(struct bch_fs *, unsigned);
 static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
@@ -620,9 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
 	up_write(&c->state_lock);
 
 	for_each_member_device(c, ca)
-		if (ca->kobj.state_in_sysfs &&
-		    ca->disk_sb.bdev)
-			sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+		bch2_dev_unlink(ca);
 
 	if (c->kobj.state_in_sysfs)
 		kobject_del(&c->kobj);
@@ -1187,9 +1186,7 @@ static void bch2_dev_free(struct bch_dev *ca)
 {
 	cancel_work_sync(&ca->io_error_work);
 
-	if (ca->kobj.state_in_sysfs &&
-	    ca->disk_sb.bdev)
-		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+	bch2_dev_unlink(ca);
 
 	if (ca->kobj.state_in_sysfs)
 		kobject_del(&ca->kobj);
@@ -1226,10 +1223,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
 	percpu_ref_kill(&ca->io_ref);
 	wait_for_completion(&ca->io_ref_completion);
 
-	if (ca->kobj.state_in_sysfs) {
-		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
-		sysfs_remove_link(&ca->kobj, "block");
-	}
+	bch2_dev_unlink(ca);
 
 	bch2_free_super(&ca->disk_sb);
 	bch2_dev_journal_exit(ca);
@@ -1251,6 +1245,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
 	complete(&ca->io_ref_completion);
 }
 
+static void bch2_dev_unlink(struct bch_dev *ca)
+{
+	/*
+	 * Testing state_in_sysfs on the block device's kobject is racy
+	 * w.r.t. the underlying block device being hot-removed, which
+	 * removes it from sysfs.
+	 *
+	 * The sysfs code doesn't appear to provide a good way to handle
+	 * this race, and block/holder.c is susceptible as well.
+	 */
+	if (!ca->kobj.state_in_sysfs || !ca->disk_sb.bdev)
+		return;
+
+	struct kobject *bdev_obj = bdev_kobj(ca->disk_sb.bdev);
+	if (bdev_obj->state_in_sysfs) {
+		sysfs_remove_link(bdev_obj, "bcachefs");
+		sysfs_remove_link(&ca->kobj, "block");
+	}
+}
+
 static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
 {
 	int ret;
-- 
cgit v1.2.3