diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-19 14:14:42 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-19 14:14:42 -0800 |
commit | 4533f6e27a366ecc3da4876074ebfe0cc0ea4f0f (patch) | |
tree | 8b6f1aeeda991e6a1ce98702d7cc35d2d2a444b1 /fs/ceph/mds_client.c | |
parent | 89d3fa45b4add00cd0056361a2498e978cb1e119 (diff) | |
parent | 0f5417cea6cfeafd5cdec4223df63ca79918fdea (diff) | |
download | lwn-4533f6e27a366ecc3da4876074ebfe0cc0ea4f0f.tar.gz lwn-4533f6e27a366ecc3da4876074ebfe0cc0ea4f0f.zip |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph changes from Sage Weil:
"On the RBD side, there is a conversion to blk-mq from Christoph,
several long-standing bug fixes from Ilya, and some cleanup from
Rickard Strandqvist.
On the CephFS side there is a long list of fixes from Zheng, including
improved session handling, a few IO path fixes, some dcache management
correctness fixes, and several blocking while !TASK_RUNNING fixes.
The core code gets a few cleanups and Chaitanya has added support for
TCP_NODELAY (which has been used on the server side for ages but we
somehow missed on the kernel client).
There is also an update to MAINTAINERS to fix up some email addresses
and reflect that Ilya and Zheng are doing most of the maintenance for
RBD and CephFS these days. Do not be surprised to see a pull request
come from one of them in the future if I am unavailable for some
reason"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (27 commits)
MAINTAINERS: update Ceph and RBD maintainers
libceph: kfree() in put_osd() shouldn't depend on authorizer
libceph: fix double __remove_osd() problem
rbd: convert to blk-mq
ceph: return error for traceless reply race
ceph: fix dentry leaks
ceph: re-send requests when MDS enters reconnecting stage
ceph: show nocephx_require_signatures and notcp_nodelay options
libceph: tcp_nodelay support
rbd: do not treat standalone as flatten
ceph: fix atomic_open snapdir
ceph: properly mark empty directory as complete
client: include kernel version in client metadata
ceph: provide seperate {inode,file}_operations for snapdir
ceph: fix request time stamp encoding
ceph: fix reading inline data when i_size > PAGE_SIZE
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_close_sessions)
ceph: avoid block operation when !TASK_RUNNING (ceph_get_caps)
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_sync)
rbd: fix error paths in rbd_dev_refresh()
...
Diffstat (limited to 'fs/ceph/mds_client.c')
-rw-r--r-- | fs/ceph/mds_client.c | 127 |
1 files changed, 93 insertions, 34 deletions
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5f62fb7a5d0a..71c073f38e54 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->max_sessions = newmax; } mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, @@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); } /* @@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_options *opt = mdsc->fsc->client->options; void *p; - const char* metadata[3][2] = { + const char* metadata[][2] = { {"hostname", utsname()->nodename}, + {"kernel_version", utsname()->release}, {"entity_id", opt->name ? opt->name : ""}, {NULL, NULL} }; @@ -1464,19 +1467,33 @@ out_unlocked: return err; } +static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + spin_lock(&ci->i_ceph_lock); + if (ci->i_flushing_caps) + ret = ci->i_cap_flush_seq >= want_flush_seq; + else + ret = 1; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + /* * flush all dirty inode data to disk. * * returns true if we've flushed through want_flush_seq */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) { - int mds, ret = 1; + int mds; dout("check_cap_flush want %lld\n", want_flush_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; mds++) { struct ceph_mds_session *session = mdsc->sessions[mds]; + struct inode *inode = NULL; if (!session) continue; @@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) list_entry(session->s_cap_flushing.next, struct ceph_inode_info, i_flushing_item); - struct inode *inode = &ci->vfs_inode; - spin_lock(&ci->i_ceph_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; + "seq %lld <= %lld to mds%d\n", + &ci->vfs_inode, ci->i_cap_flush_seq, + want_flush_seq, session->s_mds); + inode = igrab(&ci->vfs_inode); } - spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); - if (!ret) - return ret; + if (inode) { + wait_event(mdsc->cap_flushing_wq, + check_cap_flush(inode, want_flush_seq)); + iput(inode); + } + mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; } /* @@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->num_releases = cpu_to_le16(releases); /* time stamp */ - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; @@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* time stamp */ p = msg->front.iov_base + req->r_request_release_offset; - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) p = rb_next(p); if (req->r_got_unsafe) continue; + if (req->r_attempts > 0) + continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); @@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; @@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* snap trace */ + realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, rinfo->snapblob, - rinfo->snapblob + rinfo->snapblob_len, - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); out_err: mutex_lock(&mdsc->mutex); if (!req->r_aborted) { @@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(req->r_got_result); + req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session, send_flushmsg_ack(mdsc, session, seq); break; + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; + struct rb_node *p; int err; dout("replay_unsafe_requests mds%d\n", session->s_mds); @@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_con_send(&session->s_con, req->r_request); } } + + /* + * also re-send old requests when MDS enters reconnect stage. So that MDS + * can process completed request in clientreplay stage. + */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (req->r_session && + req->r_session->s_mds == session->s_mds) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + } mutex_unlock(&mdsc->mutex); } @@ -2787,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_gen_ttl_lock); spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. * If a cap get released before being added to the cap reconnect, @@ -2933,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc, mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } - - /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -3295,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; init_rwsem(&mdsc->snap_rwsem); @@ -3428,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; - want_flush = mdsc->cap_flush_seq; mutex_unlock(&mdsc->mutex); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->cap_flush_seq; + spin_unlock(&mdsc->cap_dirty_lock); + + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); + wait_caps_flush(mdsc, want_flush); } /* @@ -3443,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc) { - int i, n = 0; - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; - - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) - if (mdsc->sessions[i]) - n++; - mutex_unlock(&mdsc->mutex); - return n == 0; + return atomic_read(&mdsc->num_sessions) == 0; } /* |