ceph: when filling trace, call ceph_get_inode outside of mutexes

Geng Jichao reported a rather complex deadlock involving several moving parts: 1) readahead is issued against an inode and some of its pages are locked while the read is in flight 2) the same inode is evicted from the cache, and this task gets stuck waiting for the page lock because of the above readahead 3) another task is processing a reply trace, and looks up the inode being evicted while holding the s_mutex. That ends up waiting for the eviction to complete 4) a write reply for an unrelated inode is then processed in the ceph_con_workfn job. It calls ceph_check_caps after putting wrbuffer caps, and that gets stuck waiting on the s_mutex held by 3. The reply to "1" is stuck behind the write reply in "4", so we deadlock at that point. This patch changes the trace processing to call ceph_get_inode outside of the s_mutex and snap_rwsem, which should break the cycle above. [ idryomov: break unnecessarily long lines ] URL: https://tracker.ceph.com/issues/47998 Reported-by: Geng Jichao <gengjichao@jd.com> Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Luis Henriques <lhenriques@suse.de> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
author: Jeff Layton <jlayton@kernel.org> 2020-11-12 10:03:38 -0500
committer: Ilya Dryomov <idryomov@gmail.com> 2020-12-14 23:21:48 +0100
commit: bca9fc14c70fcbbebc84954cc39994e463fb9468 (patch)
tree: b9ccd18bff3ea1a634d2cfaf552b4acdfb0aec60 /fs/ceph
parent: 6646ea1c8e8716ab6b8b60ff4930f808442cfe12 (diff)
download: lwn-bca9fc14c70fcbbebc84954cc39994e463fb9468.tar.gz
lwn-bca9fc14c70fcbbebc84954cc39994e463fb9468.zip
2 files changed, 21 insertions, 9 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ada906280e3a..11ccdee4ed6a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1315,15 +1315,10 @@ retry_lookup:
 	}
 
 	if (rinfo->head->is_target) {
-		tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		in = ceph_get_inode(sb, tvino);
-		if (IS_ERR(in)) {
-			err = PTR_ERR(in);
-			goto done;
-		}
+		/* Should be filled in by handle_reply */
+		BUG_ON(!req->r_target_inode);
 
+		in = req->r_target_inode;
 		err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
 				NULL, session,
 				(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
@@ -1333,13 +1328,13 @@ retry_lookup:
 		if (err < 0) {
 			pr_err("ceph_fill_inode badness %p %llx.%llx\n",
 				in, ceph_vinop(in));
+			req->r_target_inode = NULL;
 			if (in->i_state & I_NEW)
 				discard_new_inode(in);
 			else
 				iput(in);
 			goto done;
 		}
-		req->r_target_inode = in;
 		if (in->i_state & I_NEW)
 			unlock_new_inode(in);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b3c941503f8c..81eaf5212eee 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3179,6 +3179,23 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
 	mutex_unlock(&mdsc->mutex);
 
+	/* Must find target inode outside of mutexes to avoid deadlocks */
+	if ((err >= 0) && rinfo->head->is_target) {
+		struct inode *in;
+		struct ceph_vino tvino = {
+			.ino  = le64_to_cpu(rinfo->targeti.in->ino),
+			.snap = le64_to_cpu(rinfo->targeti.in->snapid)
+		};
+
+		in = ceph_get_inode(mdsc->fsc->sb, tvino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			mutex_lock(&session->s_mutex);
+			goto out_err;
+		}
+		req->r_target_inode = in;
+	}
+
 	mutex_lock(&session->s_mutex);
 	if (err < 0) {
 		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
author	Jeff Layton <jlayton@kernel.org>	2020-11-12 10:03:38 -0500
committer	Ilya Dryomov <idryomov@gmail.com>	2020-12-14 23:21:48 +0100
commit	bca9fc14c70fcbbebc84954cc39994e463fb9468 (patch)
tree	b9ccd18bff3ea1a634d2cfaf552b4acdfb0aec60 /fs/ceph
parent	6646ea1c8e8716ab6b8b60ff4930f808442cfe12 (diff)
download	lwn-bca9fc14c70fcbbebc84954cc39994e463fb9468.tar.gz lwn-bca9fc14c70fcbbebc84954cc39994e463fb9468.zip