summaryrefslogtreecommitdiff
path: root/include/linux/sunrpc
diff options
context:
space:
mode:
authorChuck Lever <chuck.lever@oracle.com>2026-02-26 09:47:35 -0500
committerChuck Lever <chuck.lever@oracle.com>2026-03-29 21:25:09 -0400
commitee66b9e3e1c69efc986f3932555f07121c3460a7 (patch)
tree445ca8c0ed52f3d1b7608334bbb1d2965b94c19e /include/linux/sunrpc
parent46ca8dd2441ffa49a7f31b9070f972f52c5779c3 (diff)
downloadlwn-ee66b9e3e1c69efc986f3932555f07121c3460a7.tar.gz
lwn-ee66b9e3e1c69efc986f3932555f07121c3460a7.zip
SUNRPC: Allocate a separate Reply page array
struct svc_rqst uses a single dynamically-allocated page array (rq_pages) for both the incoming RPC Call message and the outgoing RPC Reply message. rq_respages is a sliding pointer into rq_pages that each transport receive path must compute based on how many pages the Call consumed. This boundary tracking is a source of confusion and bugs, and prevents an RPC transaction from having both a large Call and a large Reply simultaneously. Allocate rq_respages as its own page array, eliminating the boundary arithmetic. This decouples Call and Reply buffer lifetimes, following the precedent set by rq_bvec (a separate dynamically- allocated array for I/O vectors). Each svc_rqst now pins twice as many pages as before. For a server running 16 threads with a 1MB maximum payload, the additional cost is roughly 16MB of pinned memory. The new dynamic svc thread count facility keeps this overhead minimal on an idle server. A subsequent patch in this series limits per-request repopulation to only the pages released during the previous RPC, avoiding a full-array scan on each call to svc_alloc_arg(). Note: We've considered several alternatives to maintaining a full second array. Each alternative reintroduces either boundary logic complexity or I/O-path allocation pressure. rq_next_page is initialized in svc_alloc_arg() and svc_process() during Reply construction, and in svc_rdma_recvfrom() as a precaution on error paths. Transport receive paths no longer compute it from the Call size. Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Diffstat (limited to 'include/linux/sunrpc')
-rw-r--r-- include/linux/sunrpc/svc.h | 47
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 62152e4f3bcc..3b1a98ab5cba 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -134,25 +134,24 @@ enum {
extern u32 svc_max_payload(const struct svc_rqst *rqstp);
/*
- * RPC Requests and replies are stored in one or more pages.
- * We maintain an array of pages for each server thread.
- * Requests are copied into these pages as they arrive. Remaining
- * pages are available to write the reply into.
+ * RPC Call and Reply messages each have their own page array.
+ * rq_pages holds the incoming Call message; rq_respages holds
+ * the outgoing Reply message. Both arrays are sized to
+ * svc_serv_maxpages() entries and are allocated dynamically.
*
- * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread
- * needs to allocate more to replace those used in sending. To help keep track
- * of these pages we have a receive list where all pages initialy live, and a
- * send list where pages are moved to when there are to be part of a reply.
+ * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each
+ * server thread needs to allocate more to replace those used in
+ * sending.
*
- * We use xdr_buf for holding responses as it fits well with NFS
- * read responses (that have a header, and some data pages, and possibly
- * a tail) and means we can share some client side routines.
+ * xdr_buf holds responses; the structure fits NFS read responses
+ * (header, data pages, optional tail) and enables sharing of
+ * client-side routines.
*
- * The xdr_buf.head kvec always points to the first page in the rq_*pages
- * list. The xdr_buf.pages pointer points to the second page on that
- * list. xdr_buf.tail points to the end of the first page.
- * This assumes that the non-page part of an rpc reply will fit
- * in a page - NFSd ensures this. lockd also has no trouble.
+ * The xdr_buf.head kvec always points to the first page in the
+ * rq_*pages list. The xdr_buf.pages pointer points to the second
+ * page on that list. xdr_buf.tail points to the end of the first
+ * page. This assumes that the non-page part of an rpc reply will
+ * fit in a page - NFSd ensures this. lockd also has no trouble.
*/
/**
@@ -162,10 +161,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
* Returns a count of pages or vectors that can hold the maximum
* size RPC message for @serv.
*
- * Each request/reply pair can have at most one "payload", plus two
- * pages, one for the request, and one for the reply.
- * nfsd_splice_actor() might need an extra page when a READ payload
- * is not page-aligned.
+ * Each page array can hold at most one payload plus two
+ * overhead pages (one for the RPC header, one for tail data).
+ * nfsd_splice_actor() might need an extra page when a READ
+ * payload is not page-aligned.
*/
static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
{
@@ -204,11 +203,11 @@ struct svc_rqst {
struct xdr_stream rq_res_stream;
struct folio *rq_scratch_folio;
struct xdr_buf rq_res;
- unsigned long rq_maxpages; /* num of entries in rq_pages */
- struct page * *rq_pages;
- struct page * *rq_respages; /* points into rq_pages */
+ unsigned long rq_maxpages; /* entries per page array */
+ struct page * *rq_pages; /* Call buffer pages */
+ struct page * *rq_respages; /* Reply buffer pages */
struct page * *rq_next_page; /* next reply page to use */
- struct page * *rq_page_end; /* one past the last page */
+ struct page * *rq_page_end; /* one past the last reply page */
struct folio_batch rq_fbatch;
struct bio_vec *rq_bvec;