From 1f6b821aef78e3d79e8d598ae59fc7e23fb6c563 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 14 Nov 2018 12:24:01 +0100 Subject: libceph: drop last_piece logic from write_partial_message_data() last_piece is for the last piece in the current data item, not in the entire data payload of the message. This is harmful for messages with multiple data items. On top of that, we don't need to signal the end of a data payload either because it is always followed by a footer. We used to signal "more" unconditionally, until commit fe38a2b67bc6 ("libceph: start defining message data cursor"). Part of a large series, it introduced cursor->last_piece and also mistakenly inverted the hint by passing last_piece for "more". This was corrected with commit c2cfa1940097 ("libceph: Fix ceph_tcp_sendpage()'s more boolean usage"). As it is, last_piece is not helping at all: because Nagle algorithm is disabled, for a simple message with two 512-byte data items we end up emitting three packets: front + first data item, second data item and footer. Go back to the original pre-fe38a2b67bc6 behavior -- a single packet in most cases. Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2f126eff275d..cca96d32ac64 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1592,7 +1592,6 @@ static int write_partial_message_data(struct ceph_connection *con) struct page *page; size_t page_offset; size_t length; - bool last_piece; int ret; if (!cursor->resid) { @@ -1600,10 +1599,9 @@ static int write_partial_message_data(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &page_offset, &length, - &last_piece); - ret = ceph_tcp_sendpage(con->sock, page, page_offset, - length, !last_piece); + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + true); if (ret <= 0) { if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); -- cgit v1.2.3 From 3239eb5215ebdef593a79316c9dbbdf8849166ec Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 16 Nov 2018 11:58:19 +0100 Subject: libceph: use sock_no_sendpage() as a fallback in ceph_tcp_sendpage() sock_no_sendpage() makes the code cleaner. Also, don't set MSG_EOR. sendpage doesn't act on MSG_EOR on its own, it just honors the setting from the preceding sendmsg call by looking at ->eor in tcp_skb_can_collapse_to(). Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cca96d32ac64..21a743a3bd29 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -560,24 +560,12 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } -static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, bool more) -{ - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); - int ret; - - ret = kernel_sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - static int ceph_tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, bool more) { - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - struct bio_vec bvec; + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0); int ret; /* @@ -589,19 +577,11 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, * triggers one of hardened usercopy checks. */ if (page_count(page) >= 1 && !PageSlab(page)) - return __ceph_tcp_sendpage(sock, page, offset, size, more); - - bvec.bv_page = page; - bvec.bv_offset = offset; - bvec.bv_len = size; - - if (more) - msg.msg_flags |= MSG_MORE; + sendpage = sock->ops->sendpage; else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + sendpage = sock_no_sendpage; - iov_iter_bvec(&msg.msg_iter, WRITE, &bvec, 1, size); - ret = sock_sendmsg(sock, &msg); + ret = sendpage(sock, page, offset, size, flags); if (ret == -EAGAIN) ret = 0; -- cgit v1.2.3 From 433b0a12953bc1dfcb52febb186136395a65aad0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 20 Nov 2018 15:44:00 +0100 Subject: libceph: use MSG_SENDPAGE_NOTLAST with ceph_tcp_sendpage() Prevent do_tcp_sendpages() from calling tcp_push() (at least) once per page. Instead, arrange for tcp_push() to be called (at least) once per data payload. This results in more MSS-sized packets and fewer packets overall (5-10% reduction in my tests with typical OSD request sizes). See commits 2f5338442425 ("tcp: allow splice() to build full TSO packets"), 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push() once") and ae62ca7b0321 ("tcp: fix MSG_SENDPAGE_NOTLAST logic") for details. Here is an example of a packet size histogram for 128K OSD requests (MSS = 1448, top 5): Before: SIZE COUNT 1448 777700 952 127915 1200 39238 1219 9806 21 5675 After: SIZE COUNT 1448 897280 21 6201 1019 2797 643 2739 376 2479 We could do slightly better by explicitly corking the socket but it's not clear it's worth it. Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 21a743a3bd29..649faa626b35 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -560,12 +560,15 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, bool more) + int offset, size_t size, int more) { ssize_t (*sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; int ret; /* @@ -1552,6 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; u32 crc; dout("%s %p msg %p\n", __func__, con, msg); @@ -1580,8 +1584,10 @@ static int write_partial_message_data(struct ceph_connection *con) } page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - true); + more); if (ret <= 0) { if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); @@ -1611,13 +1617,16 @@ static int write_partial_message_data(struct ceph_connection *con) */ static int write_partial_skip(struct ceph_connection *con) { + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; int ret; dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_SIZE); - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); + if (size == con->out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); if (ret <= 0) goto out; con->out_skip -= ret; -- cgit v1.2.3 From 87349cdad963163b55cf7d327f5d47a647339838 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 21 Nov 2018 18:56:40 +0100 Subject: libceph: switch more to bool in ceph_tcp_sendmsg() Unlike in ceph_tcp_sendpage(), it's a bool. Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 649faa626b35..d5718284db57 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -544,7 +544,7 @@ static int ceph_tcp_recvpage(struct socket *sock, struct page *page, * shortly. */ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, int more) + size_t kvlen, size_t len, bool more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; -- cgit v1.2.3