summaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/Kconfig14
-rw-r--r--net/sunrpc/auth.c6
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c91
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c148
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_internal.h7
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_test.c93
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c2
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c2
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c94
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c18
-rw-r--r--net/sunrpc/auth_unix.c2
-rw-r--r--net/sunrpc/backchannel_rqst.c40
-rw-r--r--net/sunrpc/cache.c277
-rw-r--r--net/sunrpc/clnt.c88
-rw-r--r--net/sunrpc/rpc_pipe.c539
-rw-r--r--net/sunrpc/rpcb_clnt.c7
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/sunrpc/socklib.c166
-rw-r--r--net/sunrpc/stats.c2
-rw-r--r--net/sunrpc/svc.c422
-rw-r--r--net/sunrpc/svc_xprt.c129
-rw-r--r--net/sunrpc/svcauth_unix.c6
-rw-r--r--net/sunrpc/svcsock.c155
-rw-r--r--net/sunrpc/sysfs.c231
-rw-r--r--net/sunrpc/xdr.c124
-rw-r--r--net/sunrpc/xprt.c29
-rw-r--r--net/sunrpc/xprtmultipath.c23
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c8
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c179
-rw-r--r--net/sunrpc/xprtrdma/ib_client.c2
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c179
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c36
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c538
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c210
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c76
-rw-r--r--net/sunrpc/xprtrdma/transport.c17
-rw-r--r--net/sunrpc/xprtrdma/verbs.c38
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h43
-rw-r--r--net/sunrpc/xprtsock.c65
41 files changed, 2278 insertions, 1834 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 2d8b67dac7b5..a570e7adf270 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -101,6 +101,20 @@ config SUNRPC_DEBUG
If unsure, say Y.
+config SUNRPC_DEBUG_TRACE
+ bool "RPC: Send dfprintk() output to the trace buffer"
+ depends on SUNRPC_DEBUG && TRACING
+ default n
+ help
+ dprintk() output can be voluminous, which can overwhelm the
+ kernel's logging facility as it must be sent to the console.
+ This option causes dprintk() output to go to the trace buffer
+ instead of the kernel log.
+
+ Enabling this causes a warning about the use of trace_printk()
+ to be logged at boot time, so say N unless you are debugging a
+ problem with sunrpc-based clients or services.
+
config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 04534ea537c8..68c0595ea2fd 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -290,12 +290,12 @@ rpcauth_init_credcache(struct rpc_auth *auth)
struct rpc_cred_cache *new;
unsigned int hashsize;
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc_obj(*new);
if (!new)
goto out_nocache;
new->hashbits = auth_hashbits;
hashsize = 1U << new->hashbits;
- new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL);
+ new->hashtable = kzalloc_objs(new->hashtable[0], hashsize);
if (!new->hashtable)
goto out_nohashtbl;
spin_lock_init(&new->lock);
@@ -489,7 +489,7 @@ static unsigned long
rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return number_cred_unused * sysctl_vfs_cache_pressure / 100;
+ return number_cred_unused;
}
static void
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 369310909fc9..9d3fb6848f40 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -39,6 +39,8 @@ static const struct rpc_authops authgss_ops;
static const struct rpc_credops gss_credops;
static const struct rpc_credops gss_nullops;
+static void gss_free_callback(struct kref *kref);
+
#define GSS_RETRY_EXPIRED 5
static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
@@ -162,7 +164,7 @@ gss_alloc_context(void)
{
struct gss_cl_ctx *ctx;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kzalloc_obj(*ctx);
if (ctx != NULL) {
ctx->gc_proc = RPC_GSS_PROC_DATA;
ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */
@@ -527,7 +529,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
int vers;
int err = -ENOMEM;
- gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL);
+ gss_msg = kzalloc_obj(*gss_msg);
if (gss_msg == NULL)
goto err;
vers = get_pipe_version(gss_auth->net);
@@ -551,6 +553,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
}
return gss_msg;
err_put_pipe_version:
+ kref_put(&gss_auth->kref, gss_free_callback);
put_pipe_version(gss_auth->net);
err_free_msg:
kfree(gss_msg);
@@ -887,25 +890,16 @@ static void gss_pipe_dentry_destroy(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct gss_pipe *gss_pipe = pdo->pdo_data;
- struct rpc_pipe *pipe = gss_pipe->pipe;
- if (pipe->dentry != NULL) {
- rpc_unlink(pipe->dentry);
- pipe->dentry = NULL;
- }
+ rpc_unlink(gss_pipe->pipe);
}
static int gss_pipe_dentry_create(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct gss_pipe *p = pdo->pdo_data;
- struct dentry *dentry;
- dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- p->pipe->dentry = dentry;
- return 0;
+ return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
}
static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = {
@@ -920,7 +914,7 @@ static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt,
struct gss_pipe *p;
int err = -ENOMEM;
- p = kmalloc(sizeof(*p), GFP_KERNEL);
+ p = kmalloc_obj(*p);
if (p == NULL)
goto err;
p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
@@ -1035,7 +1029,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
if (!try_module_get(THIS_MODULE))
return ERR_PTR(err);
- if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
+ if (!(gss_auth = kmalloc_obj(*gss_auth)))
goto out_dec;
INIT_HLIST_NODE(&gss_auth->hash);
gss_auth->target_name = NULL;
@@ -1252,7 +1246,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
struct gss_cred *new;
/* Make a copy of the cred so that we can reference count it */
- new = kzalloc(sizeof(*gss_cred), GFP_KERNEL);
+ new = kzalloc_obj(*gss_cred);
if (new) {
struct auth_cred acred = {
.cred = gss_cred->gc_base.cr_cred,
@@ -1386,7 +1380,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
struct gss_cred *cred = NULL;
int err = -ENOMEM;
- if (!(cred = kzalloc(sizeof(*cred), gfp)))
+ if (!(cred = kzalloc_obj(*cred, gfp)))
goto out_err;
rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
@@ -1545,6 +1539,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
struct kvec iov;
struct xdr_buf verf_buf;
int status;
+ u32 seqno;
/* Credential */
@@ -1556,15 +1551,16 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
cred_len = p++;
spin_lock(&ctx->gc_seq_lock);
- req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ;
+ seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ;
+ xprt_rqst_add_seqno(req, seqno);
spin_unlock(&ctx->gc_seq_lock);
- if (req->rq_seqno == MAXSEQ)
+ if (*req->rq_seqnos == MAXSEQ)
goto expired;
trace_rpcgss_seqno(task);
*p++ = cpu_to_be32(RPC_GSS_VERSION);
*p++ = cpu_to_be32(ctx->gc_proc);
- *p++ = cpu_to_be32(req->rq_seqno);
+ *p++ = cpu_to_be32(*req->rq_seqnos);
*p++ = cpu_to_be32(gss_cred->gc_service);
p = xdr_encode_netobj(p, &ctx->gc_wire_ctx);
*cred_len = cpu_to_be32((p - (cred_len + 1)) << 2);
@@ -1678,17 +1674,31 @@ gss_refresh_null(struct rpc_task *task)
return 0;
}
+static u32
+gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len)
+{
+ struct kvec iov;
+ struct xdr_buf verf_buf;
+ struct xdr_netobj mic;
+
+ *seq = cpu_to_be32(seqno);
+ iov.iov_base = seq;
+ iov.iov_len = 4;
+ xdr_buf_from_iov(&iov, &verf_buf);
+ mic.data = (u8 *)p;
+ mic.len = len;
+ return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+}
+
static int
gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
__be32 *p, *seq = NULL;
- struct kvec iov;
- struct xdr_buf verf_buf;
- struct xdr_netobj mic;
u32 len, maj_stat;
int status;
+ int i = 1; /* don't recheck the first item */
p = xdr_inline_decode(xdr, 2 * sizeof(*p));
if (!p)
@@ -1705,13 +1715,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
seq = kmalloc(4, GFP_KERNEL);
if (!seq)
goto validate_failed;
- *seq = cpu_to_be32(task->tk_rqstp->rq_seqno);
- iov.iov_base = seq;
- iov.iov_len = 4;
- xdr_buf_from_iov(&iov, &verf_buf);
- mic.data = (u8 *)p;
- mic.len = len;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+ maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len);
+ /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */
+ while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count))
+ maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat)
@@ -1750,7 +1757,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
if (!p)
goto wrap_failed;
integ_len = p++;
- *p = cpu_to_be32(rqstp->rq_seqno);
+ *p = cpu_to_be32(*rqstp->rq_seqnos);
if (rpcauth_wrap_req_encode(task, xdr))
goto wrap_failed;
@@ -1810,9 +1817,7 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT;
rqstp->rq_enc_pages_num = last - first + 1 + 1;
rqstp->rq_enc_pages
- = kmalloc_array(rqstp->rq_enc_pages_num,
- sizeof(struct page *),
- GFP_KERNEL);
+ = kmalloc_objs(struct page *, rqstp->rq_enc_pages_num);
if (!rqstp->rq_enc_pages)
goto out;
for (i=0; i < rqstp->rq_enc_pages_num; i++) {
@@ -1847,7 +1852,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
if (!p)
goto wrap_failed;
opaque_len = p++;
- *p = cpu_to_be32(rqstp->rq_seqno);
+ *p = cpu_to_be32(*rqstp->rq_seqnos);
if (rpcauth_wrap_req_encode(task, xdr))
goto wrap_failed;
@@ -2001,7 +2006,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
offset = rcv_buf->len - xdr_stream_remaining(xdr);
if (xdr_stream_decode_u32(xdr, &seqno))
goto unwrap_failed;
- if (seqno != rqstp->rq_seqno)
+ if (seqno != *rqstp->rq_seqnos)
goto bad_seqno;
if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
goto unwrap_failed;
@@ -2045,7 +2050,7 @@ unwrap_failed:
trace_rpcgss_unwrap_failed(task);
goto out;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+ trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno);
goto out;
bad_mic:
trace_rpcgss_verify_mic(task, maj_stat);
@@ -2077,7 +2082,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
if (maj_stat != GSS_S_COMPLETE)
goto bad_unwrap;
/* gss_unwrap decrypted the sequence number */
- if (be32_to_cpup(p++) != rqstp->rq_seqno)
+ if (be32_to_cpup(p++) != *rqstp->rq_seqnos)
goto bad_seqno;
/* gss_unwrap redacts the opaque blob from the head iovec.
@@ -2093,7 +2098,7 @@ unwrap_failed:
trace_rpcgss_unwrap_failed(task);
return -EIO;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p));
+ trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p));
return -EIO;
bad_unwrap:
trace_rpcgss_unwrap(task, maj_stat);
@@ -2118,14 +2123,14 @@ gss_xmit_need_reencode(struct rpc_task *task)
if (!ctx)
goto out;
- if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
+ if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq)))
goto out_ctx;
seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
- while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
+ while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) {
u32 tmp = seq_xmit;
- seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
+ seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos);
if (seq_xmit == tmp) {
ret = false;
goto out_ctx;
@@ -2134,7 +2139,7 @@ gss_xmit_need_reencode(struct rpc_task *task)
win = ctx->gc_win;
if (win > 0)
- ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);
+ ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win);
out_ctx:
gss_put_ctx(ctx);
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 9a27201638e2..16dcf115de1e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -138,60 +138,6 @@ out:
return ret;
}
-/**
- * krb5_decrypt - simple decryption of an RPCSEC GSS payload
- * @tfm: initialized cipher transform
- * @iv: pointer to an IV
- * @in: ciphertext to decrypt
- * @out: OUT: plaintext
- * @length: length of input and output buffers, in bytes
- *
- * @iv may be NULL to force the use of an all-zero IV.
- * The buffer containing the IV must be as large as the
- * cipher's ivsize.
- *
- * Return values:
- * %0: @in successfully decrypted into @out
- * negative errno: @in not decrypted
- */
-u32
-krb5_decrypt(
- struct crypto_sync_skcipher *tfm,
- void * iv,
- void * in,
- void * out,
- int length)
-{
- u32 ret = -EINVAL;
- struct scatterlist sg[1];
- u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- if (length % crypto_sync_skcipher_blocksize(tfm) != 0)
- goto out;
-
- if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
- dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n",
- crypto_sync_skcipher_ivsize(tfm));
- goto out;
- }
- if (iv)
- memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm));
-
- memcpy(out, in, length);
- sg_init_one(sg, out, length);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, length, local_iv);
-
- ret = crypto_skcipher_decrypt(req);
- skcipher_request_zero(req);
-out:
- dprintk("RPC: gss_k5decrypt returns %d\n",ret);
- return ret;
-}
-
static int
checksummer(struct scatterlist *sg, void *data)
{
@@ -202,96 +148,6 @@ checksummer(struct scatterlist *sg, void *data)
return crypto_ahash_update(req);
}
-/*
- * checksum the plaintext data and hdrlen bytes of the token header
- * The checksum is performed over the first 8 bytes of the
- * gss token header and then over the data body
- */
-u32
-make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
- struct xdr_buf *body, int body_offset, u8 *cksumkey,
- unsigned int usage, struct xdr_netobj *cksumout)
-{
- struct crypto_ahash *tfm;
- struct ahash_request *req;
- struct scatterlist sg[1];
- int err = -1;
- u8 *checksumdata;
- unsigned int checksumlen;
-
- if (cksumout->len < kctx->gk5e->cksumlength) {
- dprintk("%s: checksum buffer length, %u, too small for %s\n",
- __func__, cksumout->len, kctx->gk5e->name);
- return GSS_S_FAILURE;
- }
-
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL);
- if (checksumdata == NULL)
- return GSS_S_FAILURE;
-
- tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- goto out_free_cksum;
-
- req = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!req)
- goto out_free_ahash;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- checksumlen = crypto_ahash_digestsize(tfm);
-
- if (cksumkey != NULL) {
- err = crypto_ahash_setkey(tfm, cksumkey,
- kctx->gk5e->keylength);
- if (err)
- goto out;
- }
-
- err = crypto_ahash_init(req);
- if (err)
- goto out;
- sg_init_one(sg, header, hdrlen);
- ahash_request_set_crypt(req, sg, NULL, hdrlen);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
- err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, req);
- if (err)
- goto out;
- ahash_request_set_crypt(req, NULL, checksumdata, 0);
- err = crypto_ahash_final(req);
- if (err)
- goto out;
-
- switch (kctx->gk5e->ctype) {
- case CKSUMTYPE_RSA_MD5:
- err = krb5_encrypt(kctx->seq, NULL, checksumdata,
- checksumdata, checksumlen);
- if (err)
- goto out;
- memcpy(cksumout->data,
- checksumdata + checksumlen - kctx->gk5e->cksumlength,
- kctx->gk5e->cksumlength);
- break;
- case CKSUMTYPE_HMAC_SHA1_DES3:
- memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
- break;
- default:
- BUG();
- break;
- }
- cksumout->len = kctx->gk5e->cksumlength;
-out:
- ahash_request_free(req);
-out_free_ahash:
- crypto_free_ahash(tfm);
-out_free_cksum:
- kfree(checksumdata);
- return err ? GSS_S_FAILURE : 0;
-}
-
/**
* gss_krb5_checksum - Compute the MAC for a GSS Wrap or MIC token
* @tfm: an initialized hash transform
@@ -1019,8 +875,8 @@ out_err:
* krb5_etm_decrypt - Decrypt using the RFC 8009 rules
* @kctx: Kerberos context
* @offset: starting offset of the ciphertext, in bytes
- * @len:
- * @buf:
+ * @len: size of ciphertext to unwrap
+ * @buf: ciphertext to unwrap
* @headskip: OUT: the enctype's confounder length, in octets
* @tailskip: OUT: the enctype's HMAC length, in octets
*
diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h
index a47e9ec228a5..8769e9e705bf 100644
--- a/net/sunrpc/auth_gss/gss_krb5_internal.h
+++ b/net/sunrpc/auth_gss/gss_krb5_internal.h
@@ -155,10 +155,6 @@ static inline int krb5_derive_key(struct krb5_ctx *kctx,
void krb5_make_confounder(u8 *p, int conflen);
-u32 make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
- struct xdr_buf *body, int body_offset, u8 *cksumkey,
- unsigned int usage, struct xdr_netobj *cksumout);
-
u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen,
const struct xdr_buf *body, int body_offset,
struct xdr_netobj *cksumout);
@@ -166,9 +162,6 @@ u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen,
u32 krb5_encrypt(struct crypto_sync_skcipher *key, void *iv, void *in,
void *out, int length);
-u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in,
- void *out, int length);
-
int xdr_extend_head(struct xdr_buf *buf, unsigned int base,
unsigned int shiftlen);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 3366505bc669..6db64a9111a9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -473,7 +473,7 @@ gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id,
struct krb5_ctx *ctx;
int ret;
- ctx = kzalloc(sizeof(*ctx), gfp_mask);
+ ctx = kzalloc_obj(*ctx, gfp_mask);
if (ctx == NULL)
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c
index a5bff02cd7ba..dde1ee934d0d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_test.c
+++ b/net/sunrpc/auth_gss/gss_krb5_test.c
@@ -63,10 +63,11 @@ static void kdf_case(struct kunit *test)
KUNIT_ASSERT_EQ(test, err, 0);
/* Assert */
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- derivedkey.data, derivedkey.len), 0,
- "key mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ derivedkey.data,
+ derivedkey.len,
+ "key mismatch");
}
static void checksum_case(struct kunit *test)
@@ -111,10 +112,11 @@ static void checksum_case(struct kunit *test)
KUNIT_ASSERT_EQ(test, err, 0);
/* Assert */
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- checksum.data, checksum.len), 0,
- "checksum mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ checksum.data,
+ checksum.len,
+ "checksum mismatch");
crypto_free_ahash(tfm);
}
@@ -314,10 +316,11 @@ static void rfc3961_nfold_case(struct kunit *test)
param->expected_result->len * 8, result);
/* Assert */
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- result, param->expected_result->len), 0,
- "result mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ result,
+ param->expected_result->len,
+ "result mismatch");
}
static struct kunit_case rfc3961_test_cases[] = {
@@ -569,14 +572,16 @@ static void rfc3962_encrypt_case(struct kunit *test)
KUNIT_EXPECT_EQ_MSG(test,
param->expected_result->len, buf.len,
"ciphertext length mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- text, param->expected_result->len), 0,
- "ciphertext mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->next_iv->data, iv,
- param->next_iv->len), 0,
- "IV mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ text,
+ param->expected_result->len,
+ "ciphertext mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->next_iv->data,
+ iv,
+ param->next_iv->len,
+ "IV mismatch");
crypto_free_sync_skcipher(cts_tfm);
crypto_free_sync_skcipher(cbc_tfm);
@@ -1194,15 +1199,17 @@ static void rfc6803_encrypt_case(struct kunit *test)
KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len,
buf.len + checksum.len,
"ciphertext length mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- buf.head[0].iov_base, buf.len), 0,
- "encrypted result mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data +
- (param->expected_result->len - checksum.len),
- checksum.data, checksum.len), 0,
- "HMAC mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ buf.head[0].iov_base,
+ buf.len,
+ "encrypted result mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data +
+ (param->expected_result->len - checksum.len),
+ checksum.data,
+ checksum.len,
+ "HMAC mismatch");
crypto_free_ahash(ahash_tfm);
crypto_free_sync_skcipher(cts_tfm);
@@ -1687,15 +1694,16 @@ static void rfc8009_encrypt_case(struct kunit *test)
KUNIT_EXPECT_EQ_MSG(test,
param->expected_result->len, buf.len,
"ciphertext length mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->expected_result->data,
- buf.head[0].iov_base,
- param->expected_result->len), 0,
- "ciphertext mismatch");
- KUNIT_EXPECT_EQ_MSG(test, memcmp(param->expected_hmac->data,
- checksum.data,
- checksum.len), 0,
- "HMAC mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_result->data,
+ buf.head[0].iov_base,
+ param->expected_result->len,
+ "ciphertext mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->expected_hmac->data,
+ checksum.data,
+ checksum.len,
+ "HMAC mismatch");
crypto_free_ahash(ahash_tfm);
crypto_free_sync_skcipher(cts_tfm);
@@ -1826,10 +1834,11 @@ static void encrypt_selftest_case(struct kunit *test)
KUNIT_EXPECT_EQ_MSG(test,
param->plaintext->len, buf.len,
"length mismatch");
- KUNIT_EXPECT_EQ_MSG(test,
- memcmp(param->plaintext->data,
- buf.head[0].iov_base, buf.len), 0,
- "plaintext mismatch");
+ KUNIT_EXPECT_MEMEQ_MSG(test,
+ param->plaintext->data,
+ buf.head[0].iov_base,
+ buf.len,
+ "plaintext mismatch");
crypto_free_sync_skcipher(cts_tfm);
crypto_free_sync_skcipher(cbc_tfm);
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index c84d0cf61980..78eab245f94a 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -355,7 +355,7 @@ gss_import_sec_context(const void *input_token, size_t bufsize,
time64_t *endtime,
gfp_t gfp_mask)
{
- if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
+ if (!(*ctx_id = kzalloc_obj(**ctx_id, gfp_mask)))
return -ENOMEM;
(*ctx_id)->mech_type = gss_mech_get(mech);
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index f549e4c05def..0fa4778620d9 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -214,7 +214,7 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
unsigned int i;
arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
- arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL);
+ arg->pages = kzalloc_objs(struct page *, arg->npages);
if (!arg->pages)
return -ENOMEM;
for (i = 0; i < arg->npages; i++) {
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index cb32ab9a8395..fceee648d545 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -244,11 +244,11 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
/* we recognize only 1 currently: CREDS_VALUE */
oa->count = 1;
- oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
+ oa->data = kmalloc_obj(struct gssx_option);
if (!oa->data)
return -ENOMEM;
- creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ creds = kzalloc_obj(struct svc_cred);
if (!creds) {
err = -ENOMEM;
goto free_oa;
@@ -320,29 +320,47 @@ static int gssx_dec_status(struct xdr_stream *xdr,
/* status->minor_status */
p = xdr_inline_decode(xdr, 8);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto out_free_mech;
+ }
p = xdr_decode_hyper(p, &status->minor_status);
/* status->major_status_string */
err = gssx_dec_buffer(xdr, &status->major_status_string);
if (err)
- return err;
+ goto out_free_mech;
/* status->minor_status_string */
err = gssx_dec_buffer(xdr, &status->minor_status_string);
if (err)
- return err;
+ goto out_free_major_status_string;
/* status->server_ctx */
err = gssx_dec_buffer(xdr, &status->server_ctx);
if (err)
- return err;
+ goto out_free_minor_status_string;
/* we assume we have no options for now, so simply consume them */
/* status->options */
err = dummy_dec_opt_array(xdr, &status->options);
+ if (err)
+ goto out_free_server_ctx;
+ return 0;
+
+out_free_server_ctx:
+ kfree(status->server_ctx.data);
+ status->server_ctx.data = NULL;
+out_free_minor_status_string:
+ kfree(status->minor_status_string.data);
+ status->minor_status_string.data = NULL;
+out_free_major_status_string:
+ kfree(status->major_status_string.data);
+ status->major_status_string.data = NULL;
+out_free_mech:
+ kfree(status->mech.data);
+ status->mech.data = NULL;
return err;
}
@@ -505,28 +523,35 @@ static int gssx_dec_name(struct xdr_stream *xdr,
/* name->name_type */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* name->exported_name */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* name->exported_composite_name */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* we assume we have no attributes for now, so simply consume them */
/* name->name_attributes */
err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array);
if (err)
- return err;
+ goto out_free_display_name;
/* we assume we have no options for now, so simply consume them */
/* name->extensions */
err = dummy_dec_opt_array(xdr, &dummy_option_array);
+ if (err)
+ goto out_free_display_name;
+ return 0;
+
+out_free_display_name:
+ kfree(name->display_name.data);
+ name->display_name.data = NULL;
return err;
}
@@ -649,32 +674,34 @@ static int gssx_dec_ctx(struct xdr_stream *xdr,
/* ctx->state */
err = gssx_dec_buffer(xdr, &ctx->state);
if (err)
- return err;
+ goto out_free_exported_context_token;
/* ctx->need_release */
err = gssx_dec_bool(xdr, &ctx->need_release);
if (err)
- return err;
+ goto out_free_state;
/* ctx->mech */
err = gssx_dec_buffer(xdr, &ctx->mech);
if (err)
- return err;
+ goto out_free_state;
/* ctx->src_name */
err = gssx_dec_name(xdr, &ctx->src_name);
if (err)
- return err;
+ goto out_free_mech;
/* ctx->targ_name */
err = gssx_dec_name(xdr, &ctx->targ_name);
if (err)
- return err;
+ goto out_free_src_name;
/* ctx->lifetime */
p = xdr_inline_decode(xdr, 8+8);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto out_free_targ_name;
+ }
p = xdr_decode_hyper(p, &ctx->lifetime);
/* ctx->ctx_flags */
@@ -683,17 +710,36 @@ static int gssx_dec_ctx(struct xdr_stream *xdr,
/* ctx->locally_initiated */
err = gssx_dec_bool(xdr, &ctx->locally_initiated);
if (err)
- return err;
+ goto out_free_targ_name;
/* ctx->open */
err = gssx_dec_bool(xdr, &ctx->open);
if (err)
- return err;
+ goto out_free_targ_name;
/* we assume we have no options for now, so simply consume them */
/* ctx->options */
err = dummy_dec_opt_array(xdr, &ctx->options);
+ if (err)
+ goto out_free_targ_name;
+
+ return 0;
+out_free_targ_name:
+ kfree(ctx->targ_name.display_name.data);
+ ctx->targ_name.display_name.data = NULL;
+out_free_src_name:
+ kfree(ctx->src_name.display_name.data);
+ ctx->src_name.display_name.data = NULL;
+out_free_mech:
+ kfree(ctx->mech.data);
+ ctx->mech.data = NULL;
+out_free_state:
+ kfree(ctx->state.data);
+ ctx->state.data = NULL;
+out_free_exported_context_token:
+ kfree(ctx->exported_context_token.data);
+ ctx->exported_context_token.data = NULL;
return err;
}
@@ -794,12 +840,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
struct gssx_res_accept_sec_context *res = data;
u32 value_follows;
int err;
- struct page *scratch;
+ struct folio *scratch;
- scratch = alloc_page(GFP_KERNEL);
+ scratch = folio_alloc(GFP_KERNEL, 0);
if (!scratch)
return -ENOMEM;
- xdr_set_scratch_page(xdr, scratch);
+ xdr_set_scratch_folio(xdr, scratch);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
@@ -844,6 +890,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
err = gssx_dec_option_array(xdr, &res->options);
out_free:
- __free_page(scratch);
+ folio_put(scratch);
return err;
}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73a90ad873fb..161d02cc1c2c 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -197,7 +197,7 @@ static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
static struct cache_head *rsi_alloc(void)
{
- struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
+ struct rsi *rsii = kmalloc_obj(*rsii);
if (rsii)
return &rsii->h;
else
@@ -449,7 +449,7 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
static struct cache_head *
rsc_alloc(void)
{
- struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
+ struct rsc *rsci = kmalloc_obj(*rsci);
if (rsci)
return &rsci->h;
else
@@ -724,7 +724,7 @@ svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
rqstp->rq_auth_stat = rpc_autherr_badverf;
return SVC_DENIED;
}
- if (flavor != RPC_AUTH_GSS) {
+ if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) {
rqstp->rq_auth_stat = rpc_autherr_badverf;
return SVC_DENIED;
}
@@ -814,7 +814,7 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
struct auth_domain *test;
int stat = -ENOMEM;
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc_obj(*new);
if (!new)
goto out;
kref_init(&new->h.ref);
@@ -1069,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
goto out_denied_free;
pages = DIV_ROUND_UP(inlen, PAGE_SIZE);
- in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+ in_token->pages = kzalloc_objs(struct page *, pages + 1);
if (!in_token->pages)
goto out_denied_free;
in_token->page_base = 0;
@@ -1083,7 +1083,8 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
}
length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p);
- memcpy(page_address(in_token->pages[0]), xdr->p, length);
+ if (length)
+ memcpy(page_address(in_token->pages[0]), xdr->p, length);
inlen -= length;
to_offs = length;
@@ -1628,9 +1629,9 @@ svcauth_gss_accept(struct svc_rqst *rqstp)
int ret;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
- rqstp->rq_auth_stat = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_failed;
if (!svcdata)
- svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
+ svcdata = kmalloc_obj(*svcdata);
if (!svcdata)
goto auth_err;
rqstp->rq_auth_data = svcdata;
@@ -1638,6 +1639,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp)
svcdata->rsci = NULL;
gc = &svcdata->clcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart))
goto auth_err;
if (gc->gc_v != RPC_GSS_VERSION)
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 1e091d3fa607..6c742a3400c4 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -45,7 +45,7 @@ static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
{
struct rpc_cred *ret;
- ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
+ ret = kmalloc_obj(*ret, rpc_task_gfp_mask());
if (!ret) {
if (!(flags & RPCAUTH_LOOKUP_ASYNC))
return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index caa94cf57123..0ffa4d01a938 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -25,6 +25,22 @@ unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
}
/*
+ * Helper function to nullify backchannel server pointer in transport.
+ * We need to synchronize setting the pointer to NULL (done so after
+ * the backchannel server is shutdown) with the usage of that pointer
+ * by the backchannel request processing routines
+ * xprt_complete_bc_request() and rpcrdma_bc_receive_call().
+ */
+void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv)
+{
+ spin_lock(&xprt->bc_pa_lock);
+ svc_destroy(serv);
+ xprt->bc_serv = NULL;
+ spin_unlock(&xprt->bc_pa_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_svc_destroy_nullify_bc);
+
+/*
* Helper routines that track the number of preallocation elements
* on the transport.
*/
@@ -78,7 +94,7 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
struct rpc_rqst *req;
/* Pre-allocate one backchannel rpc_rqst */
- req = kzalloc(sizeof(*req), gfp_flags);
+ req = kzalloc_obj(*req, gfp_flags);
if (req == NULL)
return NULL;
@@ -131,7 +147,7 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
{
struct rpc_rqst *req;
- struct list_head tmp_list;
+ LIST_HEAD(tmp_list);
int i;
dprintk("RPC: setup backchannel transport\n");
@@ -147,7 +163,6 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
* lock is held on the rpc_xprt struct. It also makes cleanup
* easier in case of memory allocation errors.
*/
- INIT_LIST_HEAD(&tmp_list);
for (i = 0; i < min_reqs; i++) {
/* Pre-allocate one backchannel rpc_rqst */
req = xprt_alloc_bc_req(xprt);
@@ -354,7 +369,6 @@ found:
void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
{
struct rpc_xprt *xprt = req->rq_xprt;
- struct svc_serv *bc_serv = xprt->bc_serv;
spin_lock(&xprt->bc_pa_lock);
list_del(&req->rq_bc_pa_list);
@@ -365,7 +379,21 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
dprintk("RPC: add callback request to list\n");
+ xprt_enqueue_bc_request(req);
+}
+
+void xprt_enqueue_bc_request(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct svc_serv *bc_serv;
+
xprt_get(xprt);
- lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
- svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+ spin_lock(&xprt->bc_pa_lock);
+ bc_serv = xprt->bc_serv;
+ if (bc_serv) {
+ lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
+ svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+ }
+ spin_unlock(&xprt->bc_pa_lock);
}
+EXPORT_SYMBOL_GPL(xprt_enqueue_bc_request);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7ce5e28a6c03..27dd6b58b8ff 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/hex.h>
#include <linux/slab.h>
#include <linux/signal.h>
#include <linux/sched.h>
@@ -133,9 +134,11 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
return tmp;
}
+ cache_get(new);
hlist_add_head_rcu(&new->cache_list, head);
detail->entries++;
- cache_get(new);
+ if (detail->nextcheck > new->expiry_time)
+ detail->nextcheck = new->expiry_time + 1;
spin_unlock(&detail->hash_lock);
if (freeme)
@@ -230,9 +233,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
spin_lock(&detail->hash_lock);
cache_entry_update(detail, tmp, new);
- hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
- detail->entries++;
cache_get(tmp);
+ hlist_add_head_rcu(&tmp->cache_list, &detail->hash_table[hash]);
+ detail->entries++;
cache_fresh_locked(tmp, new->expiry_time, detail);
cache_fresh_locked(old, 0, detail);
spin_unlock(&detail->hash_lock);
@@ -396,7 +399,11 @@ static struct delayed_work cache_cleaner;
void sunrpc_init_cache_detail(struct cache_detail *cd)
{
spin_lock_init(&cd->hash_lock);
- INIT_LIST_HEAD(&cd->queue);
+ INIT_LIST_HEAD(&cd->requests);
+ INIT_LIST_HEAD(&cd->readers);
+ spin_lock_init(&cd->queue_lock);
+ init_waitqueue_head(&cd->queue_wait);
+ cd->next_seqno = 1;
spin_lock(&cache_list_lock);
cd->nextcheck = 0;
cd->entries = 0;
@@ -462,24 +469,21 @@ static int cache_clean(void)
}
}
+ spin_lock(&current_detail->hash_lock);
+
/* find a non-empty bucket in the table */
- while (current_detail &&
- current_index < current_detail->hash_size &&
+ while (current_index < current_detail->hash_size &&
hlist_empty(&current_detail->hash_table[current_index]))
current_index++;
/* find a cleanable entry in the bucket and clean it, or set to next bucket */
-
- if (current_detail && current_index < current_detail->hash_size) {
+ if (current_index < current_detail->hash_size) {
struct cache_head *ch = NULL;
struct cache_detail *d;
struct hlist_head *head;
struct hlist_node *tmp;
- spin_lock(&current_detail->hash_lock);
-
/* Ok, now to clean this strand */
-
head = &current_detail->hash_table[current_index];
hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
if (current_detail->nextcheck > ch->expiry_time)
@@ -500,8 +504,10 @@ static int cache_clean(void)
spin_unlock(&cache_list_lock);
if (ch)
sunrpc_end_cache_remove_entry(ch, d);
- } else
+ } else {
+ spin_unlock(&current_detail->hash_lock);
spin_unlock(&cache_list_lock);
+ }
return rv;
}
@@ -792,31 +798,20 @@ void cache_clean_deferred(void *owner)
* On read, you get a full request, or block.
* On write, an update request is processed.
* Poll works if anything to read, and always allows write.
- *
- * Implemented by linked list of requests. Each open file has
- * a ->private that also exists in this list. New requests are added
- * to the end and may wakeup and preceding readers.
- * New readers are added to the head. If, on read, an item is found with
- * CACHE_UPCALLING clear, we free it from the list.
- *
*/
-static DEFINE_SPINLOCK(queue_lock);
-
-struct cache_queue {
- struct list_head list;
- int reader; /* if 0, then request */
-};
struct cache_request {
- struct cache_queue q;
+ struct list_head list;
struct cache_head *item;
- char * buf;
+ char *buf;
int len;
int readers;
+ u64 seqno;
};
struct cache_reader {
- struct cache_queue q;
+ struct list_head list;
int offset; /* if non-0, we have a refcnt on next request */
+ u64 next_seqno;
};
static int cache_request(struct cache_detail *detail,
@@ -831,6 +826,17 @@ static int cache_request(struct cache_detail *detail,
return PAGE_SIZE - len;
}
+static struct cache_request *
+cache_next_request(struct cache_detail *cd, u64 seqno)
+{
+ struct cache_request *rq;
+
+ list_for_each_entry(rq, &cd->requests, list)
+ if (rq->seqno >= seqno)
+ return rq;
+ return NULL;
+}
+
static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
loff_t *ppos, struct cache_detail *cd)
{
@@ -845,25 +851,18 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
inode_lock(inode); /* protect against multiple concurrent
* readers on this file */
again:
- spin_lock(&queue_lock);
+ spin_lock(&cd->queue_lock);
/* need to find next request */
- while (rp->q.list.next != &cd->queue &&
- list_entry(rp->q.list.next, struct cache_queue, list)
- ->reader) {
- struct list_head *next = rp->q.list.next;
- list_move(&rp->q.list, next);
- }
- if (rp->q.list.next == &cd->queue) {
- spin_unlock(&queue_lock);
+ rq = cache_next_request(cd, rp->next_seqno);
+ if (!rq) {
+ spin_unlock(&cd->queue_lock);
inode_unlock(inode);
WARN_ON_ONCE(rp->offset);
return 0;
}
- rq = container_of(rp->q.list.next, struct cache_request, q.list);
- WARN_ON_ONCE(rq->q.reader);
if (rp->offset == 0)
rq->readers++;
- spin_unlock(&queue_lock);
+ spin_unlock(&cd->queue_lock);
if (rq->len == 0) {
err = cache_request(cd, rq);
@@ -874,9 +873,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) {
err = -EAGAIN;
- spin_lock(&queue_lock);
- list_move(&rp->q.list, &rq->q.list);
- spin_unlock(&queue_lock);
+ rp->next_seqno = rq->seqno + 1;
} else {
if (rp->offset + count > rq->len)
count = rq->len - rp->offset;
@@ -886,26 +883,24 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
rp->offset += count;
if (rp->offset >= rq->len) {
rp->offset = 0;
- spin_lock(&queue_lock);
- list_move(&rp->q.list, &rq->q.list);
- spin_unlock(&queue_lock);
+ rp->next_seqno = rq->seqno + 1;
}
err = 0;
}
out:
if (rp->offset == 0) {
/* need to release rq */
- spin_lock(&queue_lock);
+ spin_lock(&cd->queue_lock);
rq->readers--;
if (rq->readers == 0 &&
!test_bit(CACHE_PENDING, &rq->item->flags)) {
- list_del(&rq->q.list);
- spin_unlock(&queue_lock);
+ list_del(&rq->list);
+ spin_unlock(&cd->queue_lock);
cache_put(rq->item, cd);
kfree(rq->buf);
kfree(rq);
} else
- spin_unlock(&queue_lock);
+ spin_unlock(&cd->queue_lock);
}
if (err == -EAGAIN)
goto again;
@@ -969,16 +964,13 @@ out:
return ret;
}
-static DECLARE_WAIT_QUEUE_HEAD(queue_wait);
-
static __poll_t cache_poll(struct file *filp, poll_table *wait,
struct cache_detail *cd)
{
__poll_t mask;
struct cache_reader *rp = filp->private_data;
- struct cache_queue *cq;
- poll_wait(filp, &queue_wait, wait);
+ poll_wait(filp, &cd->queue_wait, wait);
/* alway allow write */
mask = EPOLLOUT | EPOLLWRNORM;
@@ -986,15 +978,11 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait,
if (!rp)
return mask;
- spin_lock(&queue_lock);
+ spin_lock(&cd->queue_lock);
- for (cq= &rp->q; &cq->list != &cd->queue;
- cq = list_entry(cq->list.next, struct cache_queue, list))
- if (!cq->reader) {
- mask |= EPOLLIN | EPOLLRDNORM;
- break;
- }
- spin_unlock(&queue_lock);
+ if (cache_next_request(cd, rp->next_seqno))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ spin_unlock(&cd->queue_lock);
return mask;
}
@@ -1004,25 +992,20 @@ static int cache_ioctl(struct inode *ino, struct file *filp,
{
int len = 0;
struct cache_reader *rp = filp->private_data;
- struct cache_queue *cq;
+ struct cache_request *rq;
if (cmd != FIONREAD || !rp)
return -EINVAL;
- spin_lock(&queue_lock);
+ spin_lock(&cd->queue_lock);
/* only find the length remaining in current request,
* or the length of the next request
*/
- for (cq= &rp->q; &cq->list != &cd->queue;
- cq = list_entry(cq->list.next, struct cache_queue, list))
- if (!cq->reader) {
- struct cache_request *cr =
- container_of(cq, struct cache_request, q);
- len = cr->len - rp->offset;
- break;
- }
- spin_unlock(&queue_lock);
+ rq = cache_next_request(cd, rp->next_seqno);
+ if (rq)
+ len = rq->len - rp->offset;
+ spin_unlock(&cd->queue_lock);
return put_user(len, (int __user *)arg);
}
@@ -1036,17 +1019,17 @@ static int cache_open(struct inode *inode, struct file *filp,
return -EACCES;
nonseekable_open(inode, filp);
if (filp->f_mode & FMODE_READ) {
- rp = kmalloc(sizeof(*rp), GFP_KERNEL);
+ rp = kmalloc_obj(*rp);
if (!rp) {
module_put(cd->owner);
return -ENOMEM;
}
rp->offset = 0;
- rp->q.reader = 1;
+ rp->next_seqno = 0;
- spin_lock(&queue_lock);
- list_add(&rp->q.list, &cd->queue);
- spin_unlock(&queue_lock);
+ spin_lock(&cd->queue_lock);
+ list_add(&rp->list, &cd->readers);
+ spin_unlock(&cd->queue_lock);
}
if (filp->f_mode & FMODE_WRITE)
atomic_inc(&cd->writers);
@@ -1060,24 +1043,35 @@ static int cache_release(struct inode *inode, struct file *filp,
struct cache_reader *rp = filp->private_data;
if (rp) {
- spin_lock(&queue_lock);
+ struct cache_request *rq = NULL;
+
+ spin_lock(&cd->queue_lock);
if (rp->offset) {
- struct cache_queue *cq;
- for (cq= &rp->q; &cq->list != &cd->queue;
- cq = list_entry(cq->list.next, struct cache_queue, list))
- if (!cq->reader) {
- container_of(cq, struct cache_request, q)
- ->readers--;
- break;
+ struct cache_request *cr;
+
+ cr = cache_next_request(cd, rp->next_seqno);
+ if (cr) {
+ cr->readers--;
+ if (cr->readers == 0 &&
+ !test_bit(CACHE_PENDING,
+ &cr->item->flags)) {
+ list_del(&cr->list);
+ rq = cr;
}
+ }
rp->offset = 0;
}
- list_del(&rp->q.list);
- spin_unlock(&queue_lock);
+ list_del(&rp->list);
+ spin_unlock(&cd->queue_lock);
+
+ if (rq) {
+ cache_put(rq->item, cd);
+ kfree(rq->buf);
+ kfree(rq);
+ }
filp->private_data = NULL;
kfree(rp);
-
}
if (filp->f_mode & FMODE_WRITE) {
atomic_dec(&cd->writers);
@@ -1091,27 +1085,24 @@ static int cache_release(struct inode *inode, struct file *filp,
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch)
{
- struct cache_queue *cq, *tmp;
- struct cache_request *cr;
+ struct cache_request *cr, *tmp;
LIST_HEAD(dequeued);
- spin_lock(&queue_lock);
- list_for_each_entry_safe(cq, tmp, &detail->queue, list)
- if (!cq->reader) {
- cr = container_of(cq, struct cache_request, q);
- if (cr->item != ch)
- continue;
- if (test_bit(CACHE_PENDING, &ch->flags))
- /* Lost a race and it is pending again */
- break;
- if (cr->readers != 0)
- continue;
- list_move(&cr->q.list, &dequeued);
- }
- spin_unlock(&queue_lock);
+ spin_lock(&detail->queue_lock);
+ list_for_each_entry_safe(cr, tmp, &detail->requests, list) {
+ if (cr->item != ch)
+ continue;
+ if (test_bit(CACHE_PENDING, &ch->flags))
+ /* Lost a race and it is pending again */
+ break;
+ if (cr->readers != 0)
+ continue;
+ list_move(&cr->list, &dequeued);
+ }
+ spin_unlock(&detail->queue_lock);
while (!list_empty(&dequeued)) {
- cr = list_entry(dequeued.next, struct cache_request, q.list);
- list_del(&cr->q.list);
+ cr = list_entry(dequeued.next, struct cache_request, list);
+ list_del(&cr->list);
cache_put(cr->item, detail);
kfree(cr->buf);
kfree(cr);
@@ -1223,26 +1214,26 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
if (!buf)
return -EAGAIN;
- crq = kmalloc(sizeof (*crq), GFP_KERNEL);
+ crq = kmalloc_obj(*crq);
if (!crq) {
kfree(buf);
return -EAGAIN;
}
- crq->q.reader = 0;
crq->buf = buf;
crq->len = 0;
crq->readers = 0;
- spin_lock(&queue_lock);
+ spin_lock(&detail->queue_lock);
if (test_bit(CACHE_PENDING, &h->flags)) {
crq->item = cache_get(h);
- list_add_tail(&crq->q.list, &detail->queue);
+ crq->seqno = detail->next_seqno++;
+ list_add_tail(&crq->list, &detail->requests);
trace_cache_entry_upcall(detail, h);
} else
/* Lost a race, no longer PENDING, so don't enqueue */
ret = -EAGAIN;
- spin_unlock(&queue_lock);
- wake_up(&queue_wait);
+ spin_unlock(&detail->queue_lock);
+ wake_up(&detail->queue_wait);
if (ret == -EAGAIN) {
kfree(buf);
kfree(crq);
@@ -1357,21 +1348,20 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
hash = n >> 32;
entry = n & ((1LL<<32) - 1);
+ if (hash >= cd->hash_size)
+ return NULL;
+
hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
- n &= ~((1LL<<32) - 1);
- do {
- hash++;
- n += 1LL<<32;
- } while(hash < cd->hash_size &&
- hlist_empty(&cd->hash_table[hash]));
- if (hash >= cd->hash_size)
- return NULL;
- *pos = n+1;
- return hlist_entry_safe(rcu_dereference_raw(
+ ch = NULL;
+ while (!ch && ++hash < cd->hash_size)
+ ch = hlist_entry_safe(rcu_dereference(
hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
+
+ *pos = ((long long)hash << 32) + 1;
+ return ch;
}
static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
@@ -1380,29 +1370,29 @@ static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
int hash = (*pos >> 32);
struct cache_detail *cd = m->private;
- if (p == SEQ_START_TOKEN)
+ if (p == SEQ_START_TOKEN) {
hash = 0;
- else if (ch->cache_list.next == NULL) {
- hash++;
- *pos += 1LL<<32;
- } else {
- ++*pos;
- return hlist_entry_safe(rcu_dereference_raw(
- hlist_next_rcu(&ch->cache_list)),
- struct cache_head, cache_list);
+ ch = NULL;
}
- *pos &= ~((1LL<<32) - 1);
- while (hash < cd->hash_size &&
- hlist_empty(&cd->hash_table[hash])) {
+ while (hash < cd->hash_size) {
+ if (ch)
+ ch = hlist_entry_safe(
+ rcu_dereference(
+ hlist_next_rcu(&ch->cache_list)),
+ struct cache_head, cache_list);
+ else
+ ch = hlist_entry_safe(
+ rcu_dereference(
+ hlist_first_rcu(&cd->hash_table[hash])),
+ struct cache_head, cache_list);
+ if (ch) {
+ ++*pos;
+ return ch;
+ }
hash++;
- *pos += 1LL<<32;
+ *pos = (long long)hash << 32;
}
- if (hash >= cd->hash_size)
- return NULL;
- ++*pos;
- return hlist_entry_safe(rcu_dereference_raw(
- hlist_first_rcu(&cd->hash_table[hash])),
- struct cache_head, cache_list);
+ return NULL;
}
void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
@@ -1743,8 +1733,7 @@ struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct ne
if (cd == NULL)
return ERR_PTR(-ENOMEM);
- cd->hash_table = kcalloc(cd->hash_size, sizeof(struct hlist_head),
- GFP_KERNEL);
+ cd->hash_table = kzalloc_objs(struct hlist_head, cd->hash_size);
if (cd->hash_table == NULL) {
kfree(cd);
return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2fe88ea79a70..bc8ca470718b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -112,47 +112,46 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
}
}
-static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
+static int rpc_setup_pipedir_sb(struct super_block *sb,
struct rpc_clnt *clnt)
{
static uint32_t clntid;
const char *dir_name = clnt->cl_program->pipe_dir_name;
char name[15];
- struct dentry *dir, *dentry;
+ struct dentry *dir;
+ int err;
dir = rpc_d_lookup_sb(sb, dir_name);
if (dir == NULL) {
pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name);
- return dir;
+ return -ENOENT;
}
for (;;) {
snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
name[sizeof(name) - 1] = '\0';
- dentry = rpc_create_client_dir(dir, name, clnt);
- if (!IS_ERR(dentry))
+ err = rpc_create_client_dir(dir, name, clnt);
+ if (!err)
break;
- if (dentry == ERR_PTR(-EEXIST))
+ if (err == -EEXIST)
continue;
printk(KERN_INFO "RPC: Couldn't create pipefs entry"
- " %s/%s, error %ld\n",
- dir_name, name, PTR_ERR(dentry));
+ " %s/%s, error %d\n",
+ dir_name, name, err);
break;
}
dput(dir);
- return dentry;
+ return err;
}
static int
rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt)
{
- struct dentry *dentry;
-
clnt->pipefs_sb = pipefs_sb;
if (clnt->cl_program->pipe_dir_name != NULL) {
- dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ int err = rpc_setup_pipedir_sb(pipefs_sb, clnt);
+ if (err && err != -ENOENT)
+ return err;
}
return 0;
}
@@ -180,16 +179,9 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
struct super_block *sb)
{
- struct dentry *dentry;
-
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = rpc_setup_pipedir_sb(sb, clnt);
- if (!dentry)
- return -ENOENT;
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- break;
+ return rpc_setup_pipedir_sb(sb, clnt);
case RPC_PIPEFS_UMOUNT:
__rpc_clnt_remove_pipedir(clnt);
break;
@@ -270,9 +262,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
old = rcu_dereference_protected(clnt->cl_xprt,
lockdep_is_held(&clnt->cl_lock));
- if (!xprt_bound(xprt))
- clnt->cl_autobind = 1;
-
clnt->cl_timeout = timeout;
rcu_assign_pointer(clnt->cl_xprt, xprt);
spin_unlock(&clnt->cl_lock);
@@ -385,7 +374,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
goto out_err;
err = -ENOMEM;
- clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
+ clnt = kzalloc_obj(*clnt);
if (!clnt)
goto out_err;
clnt->cl_parent = parent ? : clnt;
@@ -512,6 +501,8 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
clnt->cl_discrtry = 1;
if (!(args->flags & RPC_CLNT_CREATE_QUIET))
clnt->cl_chatty = 1;
+ if (args->flags & RPC_CLNT_CREATE_NETUNREACH_FATAL)
+ clnt->cl_netunreach_fatal = 1;
return clnt;
}
@@ -662,6 +653,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
new->cl_noretranstimeo = clnt->cl_noretranstimeo;
new->cl_discrtry = clnt->cl_discrtry;
new->cl_chatty = clnt->cl_chatty;
+ new->cl_netunreach_fatal = clnt->cl_netunreach_fatal;
new->cl_principal = clnt->cl_principal;
new->cl_max_connect = clnt->cl_max_connect;
return new;
@@ -1195,6 +1187,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_flags |= RPC_TASK_TIMEOUT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
+ if (clnt->cl_netunreach_fatal)
+ task->tk_flags |= RPC_TASK_NETUNREACH_FATAL;
atomic_inc(&clnt->cl_task_count);
}
@@ -1463,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
switch (sap->sa_family) {
case AF_INET:
err = kernel_bind(sock,
- (struct sockaddr *)&rpc_inaddr_loopback,
+ (struct sockaddr_unsized *)&rpc_inaddr_loopback,
sizeof(rpc_inaddr_loopback));
break;
case AF_INET6:
err = kernel_bind(sock,
- (struct sockaddr *)&rpc_in6addr_loopback,
+ (struct sockaddr_unsized *)&rpc_in6addr_loopback,
sizeof(rpc_in6addr_loopback));
break;
default:
@@ -1480,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
goto out_release;
}
- err = kernel_connect(sock, sap, salen, 0);
+ err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0);
if (err < 0) {
dprintk("RPC: can't connect UDP socket (%d)\n", err);
goto out_release;
@@ -2102,14 +2096,17 @@ call_bind_status(struct rpc_task *task)
case -EPROTONOSUPPORT:
trace_rpcb_bind_version_err(task);
goto retry_timeout;
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL)
+ break;
+ fallthrough;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
case -ECONNABORTED:
case -ENOTCONN:
case -EHOSTDOWN:
- case -ENETDOWN:
case -EHOSTUNREACH:
- case -ENETUNREACH:
case -EPIPE:
trace_rpcb_unreachable_err(task);
if (!RPC_IS_SOFTCONN(task)) {
@@ -2191,19 +2188,22 @@ call_connect_status(struct rpc_task *task)
task->tk_status = 0;
switch (status) {
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL)
+ break;
+ fallthrough;
case -ECONNREFUSED:
case -ECONNRESET:
/* A positive refusal suggests a rebind is needed. */
- if (RPC_IS_SOFTCONN(task))
- break;
if (clnt->cl_autobind) {
rpc_force_rebind(clnt);
+ if (RPC_IS_SOFTCONN(task))
+ break;
goto out_retry;
}
fallthrough;
case -ECONNABORTED:
- case -ENETDOWN:
- case -ENETUNREACH:
case -EHOSTUNREACH:
case -EPIPE:
case -EPROTO:
@@ -2455,10 +2455,13 @@ call_status(struct rpc_task *task)
trace_rpc_call_status(task);
task->tk_status = 0;
switch(status) {
- case -EHOSTDOWN:
case -ENETDOWN:
- case -EHOSTUNREACH:
case -ENETUNREACH:
+ if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL)
+ goto out_exit;
+ fallthrough;
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
case -EPERM:
if (RPC_IS_SOFTCONN(task))
goto out_exit;
@@ -2760,8 +2763,13 @@ out_verifier:
case -EPROTONOSUPPORT:
goto out_err;
case -EACCES:
- /* Re-encode with a fresh cred */
- fallthrough;
+ /* possible RPCSEC_GSS out-of-sequence event (RFC2203),
+ * reset recv state and keep waiting, don't retransmit
+ */
+ task->tk_rqstp->rq_reply_bytes_recvd = 0;
+ task->tk_status = xprt_request_enqueue_receive(task);
+ task->tk_action = call_transmit_status;
+ return -EBADMSG;
default:
goto out_garbage;
}
@@ -2968,7 +2976,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
return -EINVAL;
}
- data = kmalloc(sizeof(*data), GFP_KERNEL);
+ data = kmalloc_obj(*data);
if (!data)
return -ENOMEM;
data->xps = xprt_switch_get(xps);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index eadc00410ebc..9d349cfbc483 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -168,8 +168,9 @@ rpc_inode_setowner(struct inode *inode, void *private)
}
static void
-rpc_close_pipes(struct inode *inode)
+rpc_close_pipes(struct dentry *dentry)
{
+ struct inode *inode = dentry->d_inode;
struct rpc_pipe *pipe = RPC_I(inode)->pipe;
int need_release;
LIST_HEAD(free_list);
@@ -484,60 +485,6 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
return inode;
}
-static int __rpc_create_common(struct inode *dir, struct dentry *dentry,
- umode_t mode,
- const struct file_operations *i_fop,
- void *private)
-{
- struct inode *inode;
-
- d_drop(dentry);
- inode = rpc_get_inode(dir->i_sb, mode);
- if (!inode)
- goto out_err;
- inode->i_ino = iunique(dir->i_sb, 100);
- if (i_fop)
- inode->i_fop = i_fop;
- if (private)
- rpc_inode_setowner(inode, private);
- d_add(dentry, inode);
- return 0;
-out_err:
- printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n",
- __FILE__, __func__, dentry);
- dput(dentry);
- return -ENOMEM;
-}
-
-static int __rpc_create(struct inode *dir, struct dentry *dentry,
- umode_t mode,
- const struct file_operations *i_fop,
- void *private)
-{
- int err;
-
- err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private);
- if (err)
- return err;
- fsnotify_create(dir, dentry);
- return 0;
-}
-
-static int __rpc_mkdir(struct inode *dir, struct dentry *dentry,
- umode_t mode,
- const struct file_operations *i_fop,
- void *private)
-{
- int err;
-
- err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private);
- if (err)
- return err;
- inc_nlink(dir);
- fsnotify_mkdir(dir, dentry);
- return 0;
-}
-
static void
init_pipe(struct rpc_pipe *pipe)
{
@@ -564,7 +511,7 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags)
{
struct rpc_pipe *pipe;
- pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL);
+ pipe = kzalloc_obj(struct rpc_pipe);
if (!pipe)
return ERR_PTR(-ENOMEM);
init_pipe(pipe);
@@ -574,119 +521,58 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags)
}
EXPORT_SYMBOL_GPL(rpc_mkpipe_data);
-static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
- umode_t mode,
- const struct file_operations *i_fop,
- void *private,
- struct rpc_pipe *pipe)
+static int rpc_new_file(struct dentry *parent,
+ const char *name,
+ umode_t mode,
+ const struct file_operations *i_fop,
+ void *private)
{
- struct rpc_inode *rpci;
- int err;
+ struct dentry *dentry = simple_start_creating(parent, name);
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
- err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
- if (err)
- return err;
- rpci = RPC_I(d_inode(dentry));
- rpci->private = private;
- rpci->pipe = pipe;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ inode = rpc_get_inode(dir->i_sb, S_IFREG | mode);
+ if (unlikely(!inode)) {
+ simple_done_creating(dentry);
+ return -ENOMEM;
+ }
+ inode->i_ino = iunique(dir->i_sb, 100);
+ if (i_fop)
+ inode->i_fop = i_fop;
+ rpc_inode_setowner(inode, private);
+ d_make_persistent(dentry, inode);
fsnotify_create(dir, dentry);
+ simple_done_creating(dentry);
return 0;
}
-static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
+static struct dentry *rpc_new_dir(struct dentry *parent,
+ const char *name,
+ umode_t mode)
{
- int ret;
-
- dget(dentry);
- ret = simple_rmdir(dir, dentry);
- d_drop(dentry);
- if (!ret)
- fsnotify_rmdir(dir, dentry);
- dput(dentry);
- return ret;
-}
-
-static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
-{
- int ret;
-
- dget(dentry);
- ret = simple_unlink(dir, dentry);
- d_drop(dentry);
- if (!ret)
- fsnotify_unlink(dir, dentry);
- dput(dentry);
- return ret;
-}
-
-static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
-
- rpc_close_pipes(inode);
- return __rpc_unlink(dir, dentry);
-}
+ struct dentry *dentry = simple_start_creating(parent, name);
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
-static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
- const char *name)
-{
- struct qstr q = QSTR(name);
- struct dentry *dentry = d_hash_and_lookup(parent, &q);
- if (!dentry) {
- dentry = d_alloc(parent, &q);
- if (!dentry)
- return ERR_PTR(-ENOMEM);
- }
- if (d_really_is_negative(dentry))
+ if (IS_ERR(dentry))
return dentry;
- dput(dentry);
- return ERR_PTR(-EEXIST);
-}
-
-/*
- * FIXME: This probably has races.
- */
-static void __rpc_depopulate(struct dentry *parent,
- const struct rpc_filelist *files,
- int start, int eof)
-{
- struct inode *dir = d_inode(parent);
- struct dentry *dentry;
- struct qstr name;
- int i;
-
- for (i = start; i < eof; i++) {
- name.name = files[i].name;
- name.len = strlen(files[i].name);
- dentry = d_hash_and_lookup(parent, &name);
- if (dentry == NULL)
- continue;
- if (d_really_is_negative(dentry))
- goto next;
- switch (d_inode(dentry)->i_mode & S_IFMT) {
- default:
- BUG();
- case S_IFREG:
- __rpc_unlink(dir, dentry);
- break;
- case S_IFDIR:
- __rpc_rmdir(dir, dentry);
- }
-next:
- dput(dentry);
+ inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode);
+ if (unlikely(!inode)) {
+ simple_done_creating(dentry);
+ return ERR_PTR(-ENOMEM);
}
-}
-static void rpc_depopulate(struct dentry *parent,
- const struct rpc_filelist *files,
- int start, int eof)
-{
- struct inode *dir = d_inode(parent);
+ inode->i_ino = iunique(dir->i_sb, 100);
+ inc_nlink(dir);
+ d_make_persistent(dentry, inode);
+ fsnotify_mkdir(dir, dentry);
+ simple_done_creating(dentry);
- inode_lock_nested(dir, I_MUTEX_CHILD);
- __rpc_depopulate(parent, files, start, eof);
- inode_unlock(dir);
+ return dentry; // borrowed
}
static int rpc_populate(struct dentry *parent,
@@ -694,92 +580,39 @@ static int rpc_populate(struct dentry *parent,
int start, int eof,
void *private)
{
- struct inode *dir = d_inode(parent);
struct dentry *dentry;
int i, err;
- inode_lock(dir);
for (i = start; i < eof; i++) {
- dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- goto out_bad;
switch (files[i].mode & S_IFMT) {
default:
BUG();
case S_IFREG:
- err = __rpc_create(dir, dentry,
+ err = rpc_new_file(parent,
+ files[i].name,
files[i].mode,
files[i].i_fop,
private);
+ if (err)
+ goto out_bad;
break;
case S_IFDIR:
- err = __rpc_mkdir(dir, dentry,
- files[i].mode,
- NULL,
- private);
+ dentry = rpc_new_dir(parent,
+ files[i].name,
+ files[i].mode);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_bad;
+ }
}
- if (err != 0)
- goto out_bad;
}
- inode_unlock(dir);
return 0;
out_bad:
- __rpc_depopulate(parent, files, start, eof);
- inode_unlock(dir);
printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
__FILE__, __func__, parent);
return err;
}
-static struct dentry *rpc_mkdir_populate(struct dentry *parent,
- const char *name, umode_t mode, void *private,
- int (*populate)(struct dentry *, void *), void *args_populate)
-{
- struct dentry *dentry;
- struct inode *dir = d_inode(parent);
- int error;
-
- inode_lock_nested(dir, I_MUTEX_PARENT);
- dentry = __rpc_lookup_create_exclusive(parent, name);
- if (IS_ERR(dentry))
- goto out;
- error = __rpc_mkdir(dir, dentry, mode, NULL, private);
- if (error != 0)
- goto out_err;
- if (populate != NULL) {
- error = populate(dentry, args_populate);
- if (error)
- goto err_rmdir;
- }
-out:
- inode_unlock(dir);
- return dentry;
-err_rmdir:
- __rpc_rmdir(dir, dentry);
-out_err:
- dentry = ERR_PTR(error);
- goto out;
-}
-
-static int rpc_rmdir_depopulate(struct dentry *dentry,
- void (*depopulate)(struct dentry *))
-{
- struct dentry *parent;
- struct inode *dir;
- int error;
-
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- inode_lock_nested(dir, I_MUTEX_PARENT);
- if (depopulate != NULL)
- depopulate(dentry);
- error = __rpc_rmdir(dir, dentry);
- inode_unlock(dir);
- dput(parent);
- return error;
-}
-
/**
* rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace
* communication
@@ -799,11 +632,13 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
* The @private argument passed here will be available to all these methods
* from the file pointer, via RPC_I(file_inode(file))->private.
*/
-struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
+int rpc_mkpipe_dentry(struct dentry *parent, const char *name,
void *private, struct rpc_pipe *pipe)
{
- struct dentry *dentry;
struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ struct inode *inode;
+ struct rpc_inode *rpci;
umode_t umode = S_IFIFO | 0600;
int err;
@@ -812,48 +647,52 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (pipe->ops->downcall == NULL)
umode &= ~0222;
- inode_lock_nested(dir, I_MUTEX_PARENT);
- dentry = __rpc_lookup_create_exclusive(parent, name);
- if (IS_ERR(dentry))
- goto out;
- err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops,
- private, pipe);
- if (err)
- goto out_err;
-out:
- inode_unlock(dir);
- return dentry;
-out_err:
- dentry = ERR_PTR(err);
- printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n",
- __FILE__, __func__, parent, name,
- err);
- goto out;
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto failed;
+ }
+
+ inode = rpc_get_inode(dir->i_sb, umode);
+ if (unlikely(!inode)) {
+ simple_done_creating(dentry);
+ err = -ENOMEM;
+ goto failed;
+ }
+ inode->i_ino = iunique(dir->i_sb, 100);
+ inode->i_fop = &rpc_pipe_fops;
+ rpci = RPC_I(inode);
+ rpci->private = private;
+ rpci->pipe = pipe;
+ rpc_inode_setowner(inode, private);
+ pipe->dentry = dentry; // borrowed
+ d_make_persistent(dentry, inode);
+ fsnotify_create(dir, dentry);
+ simple_done_creating(dentry);
+ return 0;
+
+failed:
+ pr_warn("%s() failed to create pipe %pd/%s (errno = %d)\n",
+ __func__, parent, name, err);
+ return err;
}
EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry);
/**
* rpc_unlink - remove a pipe
- * @dentry: dentry for the pipe, as returned from rpc_mkpipe
+ * @pipe: the pipe to be removed
*
* After this call, lookups will no longer find the pipe, and any
* attempts to read or write using preexisting opens of the pipe will
* return -EPIPE.
*/
-int
-rpc_unlink(struct dentry *dentry)
+void
+rpc_unlink(struct rpc_pipe *pipe)
{
- struct dentry *parent;
- struct inode *dir;
- int error = 0;
-
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- inode_lock_nested(dir, I_MUTEX_PARENT);
- error = __rpc_rmpipe(dir, dentry);
- inode_unlock(dir);
- dput(parent);
- return error;
+ if (pipe->dentry) {
+ simple_recursive_removal(pipe->dentry, rpc_close_pipes);
+ pipe->dentry = NULL;
+ }
}
EXPORT_SYMBOL_GPL(rpc_unlink);
@@ -1010,31 +849,6 @@ rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
pdo->pdo_ops->destroy(dir, pdo);
}
-enum {
- RPCAUTH_info,
- RPCAUTH_EOF
-};
-
-static const struct rpc_filelist authfiles[] = {
- [RPCAUTH_info] = {
- .name = "info",
- .i_fop = &rpc_info_operations,
- .mode = S_IFREG | 0400,
- },
-};
-
-static int rpc_clntdir_populate(struct dentry *dentry, void *private)
-{
- return rpc_populate(dentry,
- authfiles, RPCAUTH_info, RPCAUTH_EOF,
- private);
-}
-
-static void rpc_clntdir_depopulate(struct dentry *dentry)
-{
- rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF);
-}
-
/**
* rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
* @dentry: the parent of new directory
@@ -1046,19 +860,27 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
* information about the client, together with any "pipes" that may
* later be created using rpc_mkpipe().
*/
-struct dentry *rpc_create_client_dir(struct dentry *dentry,
- const char *name,
- struct rpc_clnt *rpc_client)
+int rpc_create_client_dir(struct dentry *dentry,
+ const char *name,
+ struct rpc_clnt *rpc_client)
{
struct dentry *ret;
+ int err;
- ret = rpc_mkdir_populate(dentry, name, 0555, NULL,
- rpc_clntdir_populate, rpc_client);
- if (!IS_ERR(ret)) {
- rpc_client->cl_pipedir_objects.pdh_dentry = ret;
- rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ ret = rpc_new_dir(dentry, name, 0555);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+ err = rpc_new_file(ret, "info", S_IFREG | 0400,
+ &rpc_info_operations, rpc_client);
+ if (err) {
+ pr_warn("%s failed to populate directory %pd\n",
+ __func__, ret);
+ simple_recursive_removal(ret, NULL);
+ return err;
}
- return ret;
+ rpc_client->cl_pipedir_objects.pdh_dentry = ret;
+ rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ return 0;
}
/**
@@ -1073,7 +895,8 @@ int rpc_remove_client_dir(struct rpc_clnt *rpc_client)
return 0;
rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
rpc_client->cl_pipedir_objects.pdh_dentry = NULL;
- return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate);
+ simple_recursive_removal(dentry, NULL);
+ return 0;
}
static const struct rpc_filelist cache_pipefs_files[3] = {
@@ -1094,28 +917,25 @@ static const struct rpc_filelist cache_pipefs_files[3] = {
},
};
-static int rpc_cachedir_populate(struct dentry *dentry, void *private)
-{
- return rpc_populate(dentry,
- cache_pipefs_files, 0, 3,
- private);
-}
-
-static void rpc_cachedir_depopulate(struct dentry *dentry)
-{
- rpc_depopulate(dentry, cache_pipefs_files, 0, 3);
-}
-
struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name,
umode_t umode, struct cache_detail *cd)
{
- return rpc_mkdir_populate(parent, name, umode, NULL,
- rpc_cachedir_populate, cd);
+ struct dentry *dentry;
+
+ dentry = rpc_new_dir(parent, name, umode);
+ if (!IS_ERR(dentry)) {
+ int error = rpc_populate(dentry, cache_pipefs_files, 0, 3, cd);
+ if (error) {
+ simple_recursive_removal(dentry, NULL);
+ return ERR_PTR(error);
+ }
+ }
+ return dentry;
}
void rpc_remove_cache_dir(struct dentry *dentry)
{
- rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate);
+ simple_recursive_removal(dentry, NULL);
}
/*
@@ -1141,7 +961,6 @@ enum {
RPCAUTH_nfsd4_cb,
RPCAUTH_cache,
RPCAUTH_nfsd,
- RPCAUTH_gssd,
RPCAUTH_RootEOF
};
@@ -1178,10 +997,6 @@ static const struct rpc_filelist files[] = {
.name = "nfsd",
.mode = S_IFDIR | 0555,
},
- [RPCAUTH_gssd] = {
- .name = "gssd",
- .mode = S_IFDIR | 0555,
- },
};
/*
@@ -1190,7 +1005,7 @@ static const struct rpc_filelist files[] = {
struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
const unsigned char *dir_name)
{
- return d_hash_and_lookup(sb->s_root, &QSTR(dir_name));
+ return try_lookup_noperm(&QSTR(dir_name), sb->s_root);
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
@@ -1241,13 +1056,6 @@ void rpc_put_sb_net(const struct net *net)
}
EXPORT_SYMBOL_GPL(rpc_put_sb_net);
-static const struct rpc_filelist gssd_dummy_clnt_dir[] = {
- [0] = {
- .name = "clntXX",
- .mode = S_IFDIR | 0555,
- },
-};
-
static ssize_t
dummy_downcall(struct file *filp, const char __user *src, size_t len)
{
@@ -1276,14 +1084,6 @@ rpc_dummy_info_show(struct seq_file *m, void *v)
}
DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info);
-static const struct rpc_filelist gssd_dummy_info_file[] = {
- [0] = {
- .name = "info",
- .i_fop = &rpc_dummy_info_fops,
- .mode = S_IFREG | 0400,
- },
-};
-
/**
* rpc_gssd_dummy_populate - create a dummy gssd pipe
* @root: root of the rpc_pipefs filesystem
@@ -1292,69 +1092,32 @@ static const struct rpc_filelist gssd_dummy_info_file[] = {
* Create a dummy set of directories and a pipe that gssd can hold open to
* indicate that it is up and running.
*/
-static struct dentry *
+static int
rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
{
- int ret = 0;
- struct dentry *gssd_dentry;
- struct dentry *clnt_dentry = NULL;
- struct dentry *pipe_dentry = NULL;
-
- /* We should never get this far if "gssd" doesn't exist */
- gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name));
- if (!gssd_dentry)
- return ERR_PTR(-ENOENT);
-
- ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL);
- if (ret) {
- pipe_dentry = ERR_PTR(ret);
- goto out;
- }
-
- clnt_dentry = d_hash_and_lookup(gssd_dentry,
- &QSTR(gssd_dummy_clnt_dir[0].name));
- if (!clnt_dentry) {
- __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
- pipe_dentry = ERR_PTR(-ENOENT);
- goto out;
- }
+ struct dentry *gssd_dentry, *clnt_dentry;
+ int err;
- ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL);
- if (ret) {
- __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
- pipe_dentry = ERR_PTR(ret);
- goto out;
- }
+ gssd_dentry = rpc_new_dir(root, "gssd", 0555);
+ if (IS_ERR(gssd_dentry))
+ return -ENOENT;
- pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
- if (IS_ERR(pipe_dentry)) {
- __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1);
- __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
- }
-out:
- dput(clnt_dentry);
- dput(gssd_dentry);
- return pipe_dentry;
-}
+ clnt_dentry = rpc_new_dir(gssd_dentry, "clntXX", 0555);
+ if (IS_ERR(clnt_dentry))
+ return -ENOENT;
-static void
-rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
-{
- struct dentry *clnt_dir = pipe_dentry->d_parent;
- struct dentry *gssd_dir = clnt_dir->d_parent;
-
- dget(pipe_dentry);
- __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
- __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
- __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
- dput(pipe_dentry);
+ err = rpc_new_file(clnt_dentry, "info", 0400,
+ &rpc_dummy_info_fops, NULL);
+ if (!err)
+ err = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
+ return err;
}
static int
rpc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
- struct dentry *root, *gssd_dentry;
+ struct dentry *root;
struct net *net = sb->s_fs_info;
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1363,7 +1126,7 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RPCAUTH_GSSMAGIC;
sb->s_op = &s_ops;
- sb->s_d_op = &simple_dentry_operations;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
inode = rpc_get_inode(sb, S_IFDIR | 0555);
@@ -1373,11 +1136,9 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc)
if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
return -ENOMEM;
- gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy);
- if (IS_ERR(gssd_dentry)) {
- __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
- return PTR_ERR(gssd_dentry);
- }
+ err = rpc_gssd_dummy_populate(root, sn->gssd_dummy);
+ if (err)
+ return err;
dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n",
net->ns.inum, NET_NAME(net));
@@ -1386,18 +1147,6 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc)
err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
RPC_PIPEFS_MOUNT,
sb);
- if (err)
- goto err_depopulate;
- mutex_unlock(&sn->pipefs_sb_lock);
- return 0;
-
-err_depopulate:
- rpc_gssd_dummy_depopulate(gssd_dentry);
- blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
- RPC_PIPEFS_UMOUNT,
- sb);
- sn->pipefs_sb = NULL;
- __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
mutex_unlock(&sn->pipefs_sb_lock);
return err;
}
@@ -1454,7 +1203,7 @@ static void rpc_kill_sb(struct super_block *sb)
sb);
mutex_unlock(&sn->pipefs_sb_lock);
out:
- kill_litter_super(sb);
+ kill_anon_super(sb);
put_net(net);
}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 102c3818bc54..6aa372188c86 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -737,7 +737,7 @@ void rpcb_getport_async(struct rpc_task *task)
goto bailout_nofree;
}
- map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask());
+ map = kzalloc_obj(struct rpcbind_args, rpc_task_gfp_mask());
if (!map) {
status = -ENOMEM;
goto bailout_release_client;
@@ -820,9 +820,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
}
trace_rpcb_setport(child, map->r_status, map->r_port);
- xprt->ops->set_port(xprt, map->r_port);
- if (map->r_port)
+ if (map->r_port) {
+ xprt->ops->set_port(xprt, map->r_port);
xprt_set_bound(xprt);
+ }
}
/*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9b45fbdc90ca..016f16ca5779 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1074,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task)
rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
return 0;
}
-EXPORT_SYMBOL_GPL(rpc_malloc);
/**
* rpc_free - free RPC buffer resources allocated via rpc_malloc
@@ -1095,7 +1094,6 @@ void rpc_free(struct rpc_task *task)
else
kfree(buf);
}
-EXPORT_SYMBOL_GPL(rpc_free);
/*
* Creation and deletion of RPC task structures
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1b2b84feeec6..d8d8842c7de5 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -27,135 +27,91 @@
struct xdr_skb_reader {
struct sk_buff *skb;
unsigned int offset;
+ bool need_checksum;
size_t count;
__wsum csum;
};
-typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to,
- size_t len);
-
/**
* xdr_skb_read_bits - copy some data bits from skb to internal buffer
* @desc: sk_buff copy helper
* @to: copy destination
* @len: number of bytes to copy
*
- * Possibly called several times to iterate over an sk_buff and copy
- * data out of it.
+ * Possibly called several times to iterate over an sk_buff and copy data out of
+ * it.
*/
static size_t
xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
{
- if (len > desc->count)
- len = desc->count;
- if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len)))
- return 0;
- desc->count -= len;
- desc->offset += len;
- return len;
-}
+ len = min(len, desc->count);
+
+ if (desc->need_checksum) {
+ __wsum csum;
+
+ csum = skb_copy_and_csum_bits(desc->skb, desc->offset, to, len);
+ desc->csum = csum_block_add(desc->csum, csum, desc->offset);
+ } else {
+ if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len)))
+ return 0;
+ }
-/**
- * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
- * @desc: sk_buff copy helper
- * @to: copy destination
- * @len: number of bytes to copy
- *
- * Same as skb_read_bits, but calculate a checksum at the same time.
- */
-static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len)
-{
- unsigned int pos;
- __wsum csum2;
-
- if (len > desc->count)
- len = desc->count;
- pos = desc->offset;
- csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len);
- desc->csum = csum_block_add(desc->csum, csum2, pos);
desc->count -= len;
desc->offset += len;
return len;
}
-/**
- * xdr_partial_copy_from_skb - copy data out of an skb
- * @xdr: target XDR buffer
- * @base: starting offset
- * @desc: sk_buff copy helper
- * @copy_actor: virtual method for copying data
- *
- */
static ssize_t
-xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
+xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc)
{
- struct page **ppage = xdr->pages;
- unsigned int len, pglen = xdr->page_len;
- ssize_t copied = 0;
- size_t ret;
-
- len = xdr->head[0].iov_len;
- if (base < len) {
- len -= base;
- ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
- copied += ret;
- if (ret != len || !desc->count)
- goto out;
- base = 0;
- } else
- base -= len;
-
- if (unlikely(pglen == 0))
- goto copy_tail;
- if (unlikely(base >= pglen)) {
- base -= pglen;
- goto copy_tail;
- }
- if (base || xdr->page_base) {
- pglen -= base;
- base += xdr->page_base;
- ppage += base >> PAGE_SHIFT;
- base &= ~PAGE_MASK;
- }
- do {
+ struct page **ppage = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ unsigned int poff = xdr->page_base & ~PAGE_MASK;
+ unsigned int pglen = xdr->page_len;
+ ssize_t copied = 0;
+ size_t ret;
+
+ if (xdr->head[0].iov_len == 0)
+ return 0;
+
+ ret = xdr_skb_read_bits(desc, xdr->head[0].iov_base,
+ xdr->head[0].iov_len);
+ if (ret != xdr->head[0].iov_len || !desc->count)
+ return ret;
+ copied += ret;
+
+ while (pglen) {
+ unsigned int len = min(PAGE_SIZE - poff, pglen);
char *kaddr;
/* ACL likes to be lazy in allocating pages - ACLs
* are small by default but can get huge. */
if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) {
- *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ *ppage = alloc_page(GFP_NOWAIT);
if (unlikely(*ppage == NULL)) {
if (copied == 0)
- copied = -ENOMEM;
- goto out;
+ return -ENOMEM;
+ return copied;
}
}
- len = PAGE_SIZE;
kaddr = kmap_atomic(*ppage);
- if (base) {
- len -= base;
- if (pglen < len)
- len = pglen;
- ret = copy_actor(desc, kaddr + base, len);
- base = 0;
- } else {
- if (pglen < len)
- len = pglen;
- ret = copy_actor(desc, kaddr, len);
- }
+ ret = xdr_skb_read_bits(desc, kaddr + poff, len);
flush_dcache_page(*ppage);
kunmap_atomic(kaddr);
+
copied += ret;
if (ret != len || !desc->count)
- goto out;
+ return copied;
ppage++;
- } while ((pglen -= len) != 0);
-copy_tail:
- len = xdr->tail[0].iov_len;
- if (base < len)
- copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
-out:
+ pglen -= len;
+ poff = 0;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ copied += xdr_skb_read_bits(desc, xdr->tail[0].iov_base,
+ xdr->tail[0].iov_len);
+ }
+
return copied;
}
@@ -169,17 +125,22 @@ out:
*/
int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
{
- struct xdr_skb_reader desc;
-
- desc.skb = skb;
- desc.offset = 0;
- desc.count = skb->len - desc.offset;
+ struct xdr_skb_reader desc = {
+ .skb = skb,
+ .count = skb->len - desc.offset,
+ };
- if (skb_csum_unnecessary(skb))
- goto no_checksum;
+ if (skb_csum_unnecessary(skb)) {
+ if (xdr_partial_copy_from_skb(xdr, &desc) < 0)
+ return -1;
+ if (desc.count)
+ return -1;
+ return 0;
+ }
+ desc.need_checksum = true;
desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
- if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0)
+ if (xdr_partial_copy_from_skb(xdr, &desc) < 0)
return -1;
if (desc.offset != skb->len) {
__wsum csum2;
@@ -194,14 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev, skb);
return 0;
-no_checksum:
- if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
- return -1;
- if (desc.count)
- return -1;
- return 0;
}
-EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg,
size_t seek)
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 383860cb1d5b..7093e18ac26c 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -126,7 +126,7 @@ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
struct rpc_iostats *stats;
int i;
- stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL);
+ stats = kzalloc_objs(*stats, clnt->cl_maxproc);
if (stats) {
for (i = 0; i < clnt->cl_maxproc; i++)
spin_lock_init(&stats[i].om_lock);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e7f9c295d13c..576fa42e7abf 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx)
if (m->mode == SVC_POOL_PERNODE)
return m->pool_to[pidx];
}
- return NUMA_NO_NODE;
+ return numa_mem_id();
}
/*
* Set the given thread's cpus_allowed mask so that it
@@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net)
svc_unregister(serv, net);
rpcb_put_local(net);
}
-EXPORT_SYMBOL_GPL(svc_rpcb_cleanup);
static int svc_uses_rpcbind(struct svc_serv *serv)
{
@@ -489,7 +488,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats,
unsigned int xdrsize;
unsigned int i;
- if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
+ if (!(serv = kzalloc_obj(*serv)))
return NULL;
serv->sv_name = prog->pg_name;
serv->sv_programs = prog;
@@ -524,8 +523,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats,
serv->sv_nrpools = npools;
serv->sv_pools =
- kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
- GFP_KERNEL);
+ kzalloc_objs(struct svc_pool, serv->sv_nrpools);
if (!serv->sv_pools) {
kfree(serv);
return NULL;
@@ -636,24 +634,30 @@ svc_destroy(struct svc_serv **servp)
EXPORT_SYMBOL_GPL(svc_destroy);
static bool
-svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
+svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node)
{
- unsigned long pages, ret;
-
- /* bc_xprt uses fore channel allocated buffers */
- if (svc_is_backchannel(rqstp))
- return true;
-
- pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply.
- * We assume one is at most one page
- */
- WARN_ON_ONCE(pages > RPCSVC_MAXPAGES);
- if (pages > RPCSVC_MAXPAGES)
- pages = RPCSVC_MAXPAGES;
-
- ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages,
- rqstp->rq_pages);
- return ret == pages;
+ rqstp->rq_maxpages = svc_serv_maxpages(serv);
+
+ /* +1 for a NULL sentinel readable by nfsd_splice_actor() */
+ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1,
+ sizeof(struct page *),
+ GFP_KERNEL, node);
+ if (!rqstp->rq_pages)
+ return false;
+
+ /* +1 for a NULL sentinel at rq_page_end (see svc_rqst_replace_page) */
+ rqstp->rq_respages = kcalloc_node(rqstp->rq_maxpages + 1,
+ sizeof(struct page *),
+ GFP_KERNEL, node);
+ if (!rqstp->rq_respages) {
+ kfree(rqstp->rq_pages);
+ rqstp->rq_pages = NULL;
+ return false;
+ }
+
+ rqstp->rq_pages_nfree = rqstp->rq_maxpages;
+ rqstp->rq_next_page = rqstp->rq_respages + rqstp->rq_maxpages;
+ return true;
}
/*
@@ -662,20 +666,31 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
static void
svc_release_buffer(struct svc_rqst *rqstp)
{
- unsigned int i;
+ unsigned long i;
- for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++)
- if (rqstp->rq_pages[i])
- put_page(rqstp->rq_pages[i]);
+ if (rqstp->rq_pages) {
+ for (i = 0; i < rqstp->rq_maxpages; i++)
+ if (rqstp->rq_pages[i])
+ put_page(rqstp->rq_pages[i]);
+ kfree(rqstp->rq_pages);
+ }
+
+ if (rqstp->rq_respages) {
+ for (i = 0; i < rqstp->rq_maxpages; i++)
+ if (rqstp->rq_respages[i])
+ put_page(rqstp->rq_respages[i]);
+ kfree(rqstp->rq_respages);
+ }
}
static void
svc_rqst_free(struct svc_rqst *rqstp)
{
folio_batch_release(&rqstp->rq_fbatch);
+ kfree(rqstp->rq_bvec);
svc_release_buffer(rqstp);
- if (rqstp->rq_scratch_page)
- put_page(rqstp->rq_scratch_page);
+ if (rqstp->rq_scratch_folio)
+ folio_put(rqstp->rq_scratch_folio);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
@@ -696,8 +711,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
- rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
- if (!rqstp->rq_scratch_page)
+ rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node);
+ if (!rqstp->rq_scratch_folio)
goto out_enomem;
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
@@ -708,7 +723,13 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
if (!rqstp->rq_resp)
goto out_enomem;
- if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
+ if (!svc_init_buffer(rqstp, serv, node))
+ goto out_enomem;
+
+ rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages,
+ sizeof(struct bio_vec),
+ GFP_KERNEL, node);
+ if (!rqstp->rq_bvec)
goto out_enomem;
rqstp->rq_err = -EAGAIN; /* No error yet */
@@ -749,119 +770,101 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
WRITE_ONCE(rqstp->rq_qtime, ktime_get());
if (!task_is_running(rqstp->rq_task)) {
wake_up_process(rqstp->rq_task);
- trace_svc_wake_up(rqstp->rq_task->pid);
+ trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid);
percpu_counter_inc(&pool->sp_threads_woken);
+ } else {
+ trace_svc_pool_thread_running(pool, rqstp->rq_task->pid);
}
rcu_read_unlock();
return;
}
rcu_read_unlock();
-
+ trace_svc_pool_thread_noidle(pool, 0);
}
EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
-static struct svc_pool *
-svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
-{
- return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools];
-}
-
-static struct svc_pool *
-svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool,
- unsigned int *state)
+/**
+ * svc_new_thread - spawn a new thread in the given pool
+ * @serv: the serv to which the pool belongs
+ * @pool: pool in which thread should be spawned
+ *
+ * Create a new thread inside @pool, which is a part of @serv.
+ * Caller must hold the service mutex.
+ *
+ * Returns 0 on success, or -errno on failure.
+ */
+int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool)
{
- struct svc_pool *pool;
- unsigned int i;
+ struct svc_rqst *rqstp;
+ struct task_struct *task;
+ int node;
+ int err = 0;
- pool = target_pool;
+ node = svc_pool_map_get_node(pool->sp_id);
- if (!pool) {
- for (i = 0; i < serv->sv_nrpools; i++) {
- pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
- if (pool->sp_nrthreads)
- break;
- }
+ rqstp = svc_prepare_thread(serv, pool, node);
+ if (!rqstp)
+ return -ENOMEM;
+ task = kthread_create_on_node(serv->sv_threadfn, rqstp,
+ node, "%s", serv->sv_name);
+ if (IS_ERR(task)) {
+ err = PTR_ERR(task);
+ goto out;
}
- if (pool && pool->sp_nrthreads) {
- set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
- set_bit(SP_NEED_VICTIM, &pool->sp_flags);
- return pool;
- }
- return NULL;
+ rqstp->rq_task = task;
+ if (serv->sv_nrpools > 1)
+ svc_pool_map_set_cpumask(task, pool->sp_id);
+
+ svc_sock_update_bufs(serv);
+ wake_up_process(task);
+
+ /* Wait for the thread to signal initialization status */
+ wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
+ err = rqstp->rq_err;
+out:
+ if (err)
+ svc_exit_thread(rqstp);
+ return err;
}
+EXPORT_SYMBOL_GPL(svc_new_thread);
static int
svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- struct svc_rqst *rqstp;
- struct task_struct *task;
- struct svc_pool *chosen_pool;
- unsigned int state = serv->sv_nrthreads-1;
- int node;
- int err;
-
- do {
- nrservs--;
- chosen_pool = svc_pool_next(serv, pool, &state);
- node = svc_pool_map_get_node(chosen_pool->sp_id);
-
- rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (!rqstp)
- return -ENOMEM;
- task = kthread_create_on_node(serv->sv_threadfn, rqstp,
- node, "%s", serv->sv_name);
- if (IS_ERR(task)) {
- svc_exit_thread(rqstp);
- return PTR_ERR(task);
- }
-
- rqstp->rq_task = task;
- if (serv->sv_nrpools > 1)
- svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+ int err = 0;
- svc_sock_update_bufs(serv);
- wake_up_process(task);
+ while (!err && nrservs--)
+ err = svc_new_thread(serv, pool);
- wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
- err = rqstp->rq_err;
- if (err) {
- svc_exit_thread(rqstp);
- return err;
- }
- } while (nrservs > 0);
-
- return 0;
+ return err;
}
static int
svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- unsigned int state = serv->sv_nrthreads-1;
- struct svc_pool *victim;
-
do {
- victim = svc_pool_victim(serv, pool, &state);
- if (!victim)
- break;
- svc_pool_wake_idle_thread(victim);
- wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS,
- TASK_IDLE);
+ set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+ set_bit(SP_NEED_VICTIM, &pool->sp_flags);
+ svc_pool_wake_idle_thread(pool);
+ wait_on_bit(&pool->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE);
nrservs++;
} while (nrservs < 0);
return 0;
}
/**
- * svc_set_num_threads - adjust number of threads per RPC service
+ * svc_set_pool_threads - adjust number of threads per pool
* @serv: RPC service to adjust
- * @pool: Specific pool from which to choose threads, or NULL
- * @nrservs: New number of threads for @serv (0 or less means kill all threads)
+ * @pool: Specific pool from which to choose threads
+ * @min_threads: min number of threads to run in @pool
+ * @max_threads: max number of threads in @pool (0 means kill all threads)
*
- * Create or destroy threads to make the number of threads for @serv the
- * given number. If @pool is non-NULL, change only threads in that pool;
- * otherwise, round-robin between all pools for @serv. @serv's
- * sv_nrthreads is adjusted for each thread created or destroyed.
+ * Create or destroy threads in @pool to bring it into an acceptable range
+ * between @min_threads and @max_threads.
+ *
+ * If @min_threads is 0 or larger than @max_threads, then it is ignored and
+ * the pool will be set to run a static @max_threads number of threads.
*
* Caller must ensure mutual exclusion between this and server startup or
* shutdown.
@@ -870,27 +873,93 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
* starting a thread.
*/
int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+svc_set_pool_threads(struct svc_serv *serv, struct svc_pool *pool,
+ unsigned int min_threads, unsigned int max_threads)
{
+ int delta;
+
if (!pool)
- nrservs -= serv->sv_nrthreads;
- else
- nrservs -= pool->sp_nrthreads;
+ return -EINVAL;
- if (nrservs > 0)
- return svc_start_kthreads(serv, pool, nrservs);
- if (nrservs < 0)
- return svc_stop_kthreads(serv, pool, nrservs);
+ /* clamp min threads to the max */
+ if (min_threads > max_threads)
+ min_threads = max_threads;
+
+ pool->sp_nrthrmin = min_threads;
+ pool->sp_nrthrmax = max_threads;
+
+ /*
+ * When min_threads is set, then only change the number of
+ * threads to bring it within an acceptable range.
+ */
+ if (min_threads) {
+ if (pool->sp_nrthreads > max_threads)
+ delta = max_threads;
+ else if (pool->sp_nrthreads < min_threads)
+ delta = min_threads;
+ else
+ return 0;
+ } else {
+ delta = max_threads;
+ }
+
+ delta -= pool->sp_nrthreads;
+ if (delta > 0)
+ return svc_start_kthreads(serv, pool, delta);
+ if (delta < 0)
+ return svc_stop_kthreads(serv, pool, delta);
return 0;
}
+EXPORT_SYMBOL_GPL(svc_set_pool_threads);
+
+/**
+ * svc_set_num_threads - adjust number of threads in serv
+ * @serv: RPC service to adjust
+ * @min_threads: min number of threads to run per pool
+ * @nrservs: New number of threads for @serv (0 means kill all threads)
+ *
+ * Create or destroy threads in @serv to bring it to @nrservs. If there
+ * are multiple pools then the new threads or victims will be distributed
+ * evenly among them.
+ *
+ * Caller must ensure mutual exclusion between this and server startup or
+ * shutdown.
+ *
+ * Returns zero on success or a negative errno if an error occurred while
+ * starting a thread. On failure, some pools may have already been
+ * adjusted; the caller is responsible for recovery.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads,
+ unsigned int nrservs)
+{
+ unsigned int base = nrservs / serv->sv_nrpools;
+ unsigned int remain = nrservs % serv->sv_nrpools;
+ int i, err = 0;
+
+ for (i = 0; i < serv->sv_nrpools; ++i) {
+ struct svc_pool *pool = &serv->sv_pools[i];
+ int threads = base;
+
+ if (remain) {
+ ++threads;
+ --remain;
+ }
+
+ err = svc_set_pool_threads(serv, pool, min_threads, threads);
+ if (err)
+ break;
+ }
+ return err;
+}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
/**
- * svc_rqst_replace_page - Replace one page in rq_pages[]
+ * svc_rqst_replace_page - Replace one page in rq_respages[]
* @rqstp: svc_rqst with pages to replace
* @page: replacement page
*
- * When replacing a page in rq_pages, batch the release of the
+ * When replacing a page in rq_respages, batch the release of the
* replaced pages to avoid hammering the page allocator.
*
* Return values:
@@ -899,19 +968,16 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads);
*/
bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
{
- struct page **begin = rqstp->rq_pages;
- struct page **end = &rqstp->rq_pages[RPCSVC_MAXPAGES];
+ struct page **begin = rqstp->rq_respages;
+ struct page **end = rqstp->rq_page_end;
if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) {
trace_svc_replace_page_err(rqstp);
return false;
}
- if (*rqstp->rq_next_page) {
- if (!folio_batch_add(&rqstp->rq_fbatch,
- page_folio(*rqstp->rq_next_page)))
- __folio_batch_release(&rqstp->rq_fbatch);
- }
+ if (*rqstp->rq_next_page)
+ svc_rqst_page_release(rqstp, *rqstp->rq_next_page);
get_page(page);
*(rqstp->rq_next_page++) = page;
@@ -923,18 +989,24 @@ EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
* svc_rqst_release_pages - Release Reply buffer pages
* @rqstp: RPC transaction context
*
- * Release response pages that might still be in flight after
- * svc_send, and any spliced filesystem-owned pages.
+ * Release response pages in the range [rq_respages, rq_next_page).
+ * NULL entries in this range are skipped, allowing transports to
+ * transfer pages to a send context before this function runs.
*/
void svc_rqst_release_pages(struct svc_rqst *rqstp)
{
- int i, count = rqstp->rq_next_page - rqstp->rq_respages;
-
- if (count) {
- release_pages(rqstp->rq_respages, count);
- for (i = 0; i < count; i++)
- rqstp->rq_respages[i] = NULL;
+ struct page **pp;
+
+ /* Walk [rq_respages, rq_next_page); NULL slots are left untouched. */
+ for (pp = rqstp->rq_respages; pp < rqstp->rq_next_page; pp++) {
+ if (*pp) {
+ /* Queue the folio on rq_fbatch; release the batch as soon
+  * as folio_batch_add() returns zero.
+  */
+ if (!folio_batch_add(&rqstp->rq_fbatch,
+ page_folio(*pp)))
+ __folio_batch_release(&rqstp->rq_fbatch);
+ /* Clear the slot so this page is not released again. */
+ *pp = NULL;
+ }
 }
+ /* Release whatever is still queued in rq_fbatch. */
+ if (rqstp->rq_fbatch.nr)
+ __folio_batch_release(&rqstp->rq_fbatch);
}
/**
@@ -1330,6 +1402,9 @@ svc_process_common(struct svc_rqst *rqstp)
int pr, rc;
__be32 *p;
+ /* Reset the accept_stat for the RPC */
+ rqstp->rq_accept_statp = NULL;
+
/* Will be turned off only when NFSv4 Sessions are used */
set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
clear_bit(RQ_DROPME, &rqstp->rq_flags);
@@ -1369,9 +1444,8 @@ svc_process_common(struct svc_rqst *rqstp)
case SVC_OK:
break;
case SVC_GARBAGE:
- goto err_garbage_args;
- case SVC_SYSERR:
- goto err_system_err;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ goto err_bad_auth;
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
@@ -1382,7 +1456,8 @@ svc_process_common(struct svc_rqst *rqstp)
goto sendit;
default:
pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res);
- goto err_system_err;
+ rqstp->rq_auth_stat = rpc_autherr_failed;
+ goto err_bad_auth;
}
if (progp == NULL)
@@ -1419,8 +1494,6 @@ svc_process_common(struct svc_rqst *rqstp)
/* Call the function that processes the request. */
rc = process.dispatch(rqstp);
- if (procp->pc_release)
- procp->pc_release(rqstp);
xdr_finish_decode(xdr);
if (!rc)
@@ -1509,20 +1582,6 @@ err_bad_proc:
serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_proc_unavail;
goto sendit;
-
-err_garbage_args:
- svc_printk(rqstp, "failed to decode RPC header\n");
-
- if (serv->sv_stats)
- serv->sv_stats->rpcbadfmt++;
- *rqstp->rq_accept_statp = rpc_garbage_args;
- goto sendit;
-
-err_system_err:
- if (serv->sv_stats)
- serv->sv_stats->rpcbadfmt++;
- *rqstp->rq_accept_statp = rpc_system_err;
- goto sendit;
}
/*
@@ -1533,6 +1592,14 @@ static void svc_drop(struct svc_rqst *rqstp)
trace_svc_drop(rqstp);
}
+/*
+ * Invoke the pc_release callback of the procedure recorded in
+ * rqstp->rq_procinfo. Does nothing when no procedure is recorded
+ * or when the procedure has no pc_release callback.
+ */
+static void svc_release_rqst(struct svc_rqst *rqstp)
+{
+	const struct svc_procedure *proc = rqstp->rq_procinfo;
+
+	if (!proc || !proc->pc_release)
+		return;
+	proc->pc_release(rqstp);
+}
+
/**
* svc_process - Execute one RPC transaction
* @rqstp: RPC transaction context
@@ -1572,9 +1639,12 @@ void svc_process(struct svc_rqst *rqstp)
if (unlikely(*p != rpc_call))
goto out_baddir;
- if (!svc_process_common(rqstp))
+ if (!svc_process_common(rqstp)) {
+ svc_release_rqst(rqstp);
goto out_drop;
+ }
svc_send(rqstp);
+ svc_release_rqst(rqstp);
return;
out_baddir:
@@ -1642,6 +1712,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
if (!proc_error) {
/* Processing error: drop the request */
xprt_free_bc_request(req);
+ svc_release_rqst(rqstp);
return;
}
/* Finally, send the reply synchronously */
@@ -1655,6 +1726,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
timeout.to_maxval = timeout.to_initval;
memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
task = rpc_run_bc_task(req, &timeout);
+ svc_release_rqst(rqstp);
if (IS_ERR(task))
return;
@@ -1714,46 +1786,6 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset,
EXPORT_SYMBOL_GPL(svc_encode_result_payload);
/**
- * svc_fill_write_vector - Construct data argument for VFS write call
- * @rqstp: svc_rqst to operate on
- * @payload: xdr_buf containing only the write data payload
- *
- * Fills in rqstp::rq_vec, and returns the number of elements.
- */
-unsigned int svc_fill_write_vector(struct svc_rqst *rqstp,
- struct xdr_buf *payload)
-{
- struct page **pages = payload->pages;
- struct kvec *first = payload->head;
- struct kvec *vec = rqstp->rq_vec;
- size_t total = payload->len;
- unsigned int i;
-
- /* Some types of transport can present the write payload
- * entirely in rq_arg.pages. In this case, @first is empty.
- */
- i = 0;
- if (first->iov_len) {
- vec[i].iov_base = first->iov_base;
- vec[i].iov_len = min_t(size_t, total, first->iov_len);
- total -= vec[i].iov_len;
- ++i;
- }
-
- while (total) {
- vec[i].iov_base = page_address(*pages);
- vec[i].iov_len = min_t(size_t, total, PAGE_SIZE);
- total -= vec[i].iov_len;
- ++i;
- ++pages;
- }
-
- WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec));
- return i;
-}
-EXPORT_SYMBOL_GPL(svc_fill_write_vector);
-
-/**
* svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call
* @rqstp: svc_rqst to operate on
* @first: buffer containing first section of pathname
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ae25405d8bd2..b16e710926c1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -488,6 +488,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
pool = svc_pool_for_cpu(xprt->xpt_server);
percpu_counter_inc(&pool->sp_sockets_queued);
+ xprt->xpt_qtime = ktime_get();
lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts);
svc_pool_wake_idle_thread(pool);
@@ -649,22 +650,13 @@ static void svc_check_conn_limits(struct svc_serv *serv)
}
}
-static bool svc_alloc_arg(struct svc_rqst *rqstp)
+static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages,
+ unsigned long npages)
{
- struct svc_serv *serv = rqstp->rq_server;
- struct xdr_buf *arg = &rqstp->rq_arg;
- unsigned long pages, filled, ret;
-
- pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
- if (pages > RPCSVC_MAXPAGES) {
- pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n",
- pages, RPCSVC_MAXPAGES);
- /* use as many pages as possible */
- pages = RPCSVC_MAXPAGES;
- }
+ unsigned long filled, ret;
- for (filled = 0; filled < pages; filled = ret) {
- ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);
+ for (filled = 0; filled < npages; filled = ret) {
+ ret = alloc_pages_bulk(GFP_KERNEL, npages, pages);
if (ret > filled)
/* Made progress, don't sleep yet */
continue;
@@ -674,11 +666,40 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
set_current_state(TASK_RUNNING);
return false;
}
- trace_svc_alloc_arg_err(pages, ret);
+ trace_svc_alloc_arg_err(npages, ret);
memalloc_retry_wait(GFP_KERNEL);
}
- rqstp->rq_page_end = &rqstp->rq_pages[pages];
- rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */
+ return true;
+}
+
+static bool svc_alloc_arg(struct svc_rqst *rqstp)
+{
+ struct xdr_buf *arg = &rqstp->rq_arg;
+ unsigned long pages, nfree;
+
+ pages = rqstp->rq_maxpages;
+
+ nfree = rqstp->rq_pages_nfree;
+ if (nfree) {
+ if (!svc_fill_pages(rqstp, rqstp->rq_pages, nfree))
+ return false;
+ rqstp->rq_pages_nfree = 0;
+ }
+
+ if (WARN_ON_ONCE(rqstp->rq_next_page < rqstp->rq_respages))
+ return false;
+ nfree = rqstp->rq_next_page - rqstp->rq_respages;
+ if (nfree) {
+ if (!svc_fill_pages(rqstp, rqstp->rq_respages, nfree))
+ return false;
+ }
+
+ rqstp->rq_next_page = rqstp->rq_respages;
+ rqstp->rq_page_end = &rqstp->rq_respages[pages];
+ /* svc_rqst_replace_page() dereferences *rq_next_page even
+ * at rq_page_end; NULL prevents releasing a garbage page.
+ */
+ rqstp->rq_page_end[0] = NULL;
/* Make arg->head point to first page and arg->pages point to rest */
arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
@@ -721,15 +742,21 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
return true;
}
-static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
+/*
+ * Sleep via schedule_timeout(). A @timeo of zero is replaced with
+ * MAX_SCHEDULE_TIMEOUT. Returns true when schedule_timeout()
+ * returned zero, false otherwise.
+ */
+static bool svc_schedule_timeout(long timeo)
+{
+	long remaining;
+
+	if (!timeo)
+		timeo = MAX_SCHEDULE_TIMEOUT;
+	remaining = schedule_timeout(timeo);
+	return !remaining;
+}
+
+static bool svc_thread_wait_for_work(struct svc_rqst *rqstp, long timeo)
{
struct svc_pool *pool = rqstp->rq_pool;
+ bool did_timeout = false;
if (svc_thread_should_sleep(rqstp)) {
set_current_state(TASK_IDLE | TASK_FREEZABLE);
llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
if (likely(svc_thread_should_sleep(rqstp)))
- schedule();
+ did_timeout = svc_schedule_timeout(timeo);
while (!llist_del_first_this(&pool->sp_idle_threads,
&rqstp->rq_idle)) {
@@ -741,7 +768,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
* for this new work. This thread can safely sleep
* until woken again.
*/
- schedule();
+ did_timeout = svc_schedule_timeout(timeo);
set_current_state(TASK_IDLE | TASK_FREEZABLE);
}
__set_current_state(TASK_RUNNING);
@@ -749,6 +776,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
cond_resched();
}
try_to_freeze();
+ return did_timeout;
}
static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -842,25 +870,38 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp)
/**
* svc_recv - Receive and process the next request on any transport
* @rqstp: an idle RPC service thread
+ * @timeo: timeout (in jiffies) (0 means infinite timeout)
*
* This code is carefully organised not to touch any cachelines in
* the shared svc_serv structure, only cachelines in the local
* svc_pool.
+ *
+ * If the timeout is 0, then the sleep will never time out.
+ *
+ * Returns -ETIMEDOUT if idle for an extended period
+ * -EBUSY if there is more work to do than available threads
+ * 0 otherwise.
*/
-void svc_recv(struct svc_rqst *rqstp)
+int svc_recv(struct svc_rqst *rqstp, long timeo)
{
struct svc_pool *pool = rqstp->rq_pool;
+ bool did_timeout;
+ int ret = 0;
if (!svc_alloc_arg(rqstp))
- return;
+ return ret;
+
+ did_timeout = svc_thread_wait_for_work(rqstp, timeo);
- svc_thread_wait_for_work(rqstp);
+ if (did_timeout && svc_thread_should_sleep(rqstp) &&
+ pool->sp_nrthrmin && pool->sp_nrthreads > pool->sp_nrthrmin)
+ ret = -ETIMEDOUT;
clear_bit(SP_TASK_PENDING, &pool->sp_flags);
if (svc_thread_should_stop(rqstp)) {
svc_thread_wake_next(rqstp);
- return;
+ return ret;
}
rqstp->rq_xprt = svc_xprt_dequeue(pool);
@@ -872,10 +913,22 @@ void svc_recv(struct svc_rqst *rqstp)
* cache information to be provided. When there are no
* idle threads, we reduce the wait time.
*/
- if (pool->sp_idle_threads.first)
+ if (pool->sp_idle_threads.first) {
rqstp->rq_chandle.thread_wait = 5 * HZ;
- else
+ } else {
rqstp->rq_chandle.thread_wait = 1 * HZ;
+ /*
+ * No idle threads: signal -EBUSY so the caller
+ * can consider spawning another thread. Use
+ * SP_TASK_STARTING to limit this signal to one
+ * thread at a time; the caller clears this flag
+ * after starting a new thread.
+ */
+ if (!did_timeout && timeo &&
+ !test_and_set_bit(SP_TASK_STARTING,
+ &pool->sp_flags))
+ ret = -EBUSY;
+ }
trace_svc_xprt_dequeue(rqstp);
svc_handle_xprt(rqstp, xprt);
@@ -894,6 +947,7 @@ void svc_recv(struct svc_rqst *rqstp)
}
}
#endif
+ return ret;
}
EXPORT_SYMBOL_GPL(svc_recv);
@@ -929,7 +983,7 @@ void svc_send(struct svc_rqst *rqstp)
*/
static void svc_age_temp_xprts(struct timer_list *t)
{
- struct svc_serv *serv = from_timer(serv, t, sv_temptimer);
+ struct svc_serv *serv = timer_container_of(serv, t, sv_temptimer);
struct svc_xprt *xprt;
struct list_head *le, *next;
@@ -1021,6 +1075,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
struct svc_serv *serv = xprt->xpt_server;
struct svc_deferred_req *dr;
+ /* unregister with rpcbind for when transport type is TCP or UDP.
+ */
+ if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) {
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock,
+ sk_xprt);
+ struct socket *sock = svsk->sk_sock;
+
+ if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family,
+ sock->sk->sk_protocol, 0) < 0)
+ pr_warn("failed to unregister %s with rpcbind\n",
+ xprt->xpt_class->xcl_name);
+ }
+
if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
return;
@@ -1109,6 +1176,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
* svc_xprt_destroy_all - Destroy transports associated with @serv
* @serv: RPC service to be shut down
* @net: target network namespace
+ * @unregister: true if it is OK to unregister the destroyed xprts
*
* Server threads may still be running (especially in the case where the
* service is still running in other network namespaces).
@@ -1121,7 +1189,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
* threads, we may need to wait a little while and then check again to
* see if they're done.
*/
-void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
+void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net,
+ bool unregister)
{
int delay = 0;
@@ -1131,6 +1200,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
svc_clean_up_xprts(serv, net);
msleep(delay++);
}
+
+ if (unregister)
+ svc_rpcb_cleanup(serv, net);
}
EXPORT_SYMBOL_GPL(svc_xprt_destroy_all);
@@ -1233,7 +1305,6 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp)
rqstp->rq_addrlen = dr->addrlen;
/* Save off transport header len in case we get deferred again */
rqstp->rq_daddr = dr->daddr;
- rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_xprt_ctxt = dr->xprt_ctxt;
dr->xprt_ctxt = NULL;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 8ca98b146ec8..3be69c145d2a 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -72,7 +72,7 @@ struct auth_domain *unix_domain_find(char *name)
return rv;
}
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc_obj(*new);
if (new == NULL)
return NULL;
kref_init(&new->h.ref);
@@ -143,7 +143,7 @@ static void update(struct cache_head *cnew, struct cache_head *citem)
}
static struct cache_head *ip_map_alloc(void)
{
- struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL);
+ struct ip_map *i = kmalloc_obj(*i);
if (i)
return &i->h;
else
@@ -458,7 +458,7 @@ static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem)
}
static struct cache_head *unix_gid_alloc(void)
{
- struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL);
+ struct unix_gid *g = kmalloc_obj(*g);
if (g)
return &g->h;
else
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 72e5a01df3d3..7be3de1a1aed 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -68,6 +68,17 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+/*
+ * Upper bound on the bio_vec entries needed to send one UDP reply:
+ *	1 for header page
+ *	enough pages for RPCSVC_MAXPAYLOAD_UDP, rounded up so that a
+ *	  payload size that is not a multiple of PAGE_SIZE (for example
+ *	  when PAGE_SIZE is larger than the payload) is still counted
+ *	1 in case payload is not aligned
+ *	1 for tail page
+ */
+enum {
+	SUNRPC_MAX_UDP_SENDPAGES =
+		1 + DIV_ROUND_UP(RPCSVC_MAXPAYLOAD_UDP, PAGE_SIZE) + 1 + 1
+};
+
/* To-do: to avoid tying up an nfsd thread while waiting for a
* handshake request, the request could instead be deferred.
*/
@@ -257,20 +268,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
+/*
+ * Non-blocking receive of up to sizeof(alert) bytes together with one
+ * control message. When data was received and the control message says
+ * the record is a TLS alert, the iterator is rewound and the record is
+ * passed to svc_tcp_sock_process_cmsg() with -EAGAIN as the fallback
+ * return value. Otherwise the sock_recvmsg() result is returned as is.
+ *
+ * @msg_flags is only read, to seed the local msghdr; it is not updated.
+ */
static int
-svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg)
+svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
- struct socket *sock = svsk->sk_sock;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
+ int ret;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
+ if (ret > 0 &&
+ tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ /* Rewind so the handler sees the bytes just received. */
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN);
+ }
+ return ret;
+}
+
+/*
+ * Non-blocking receive into @msg without a control buffer. If the
+ * receive reports MSG_CTRUNC, the MSG_CTRUNC and MSG_EOR flags are
+ * cleared and, when nothing was received (0) or the call failed with
+ * -EIO, the pending record is fetched via svc_tcp_sock_recv_cmsg().
+ *
+ * NOTE(review): MSG_CTRUNC presumably means a non-data TLS record is
+ * waiting, since no control buffer was supplied; confirm against the
+ * TLS receive path.
+ */
+static int
+svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg)
+{
int ret;
+ struct socket *sock = svsk->sk_sock;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
ret = sock_recvmsg(sock, msg, MSG_DONTWAIT);
- if (unlikely(msg->msg_controllen != sizeof(u)))
- ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags);
+ }
return ret;
}
@@ -313,15 +351,13 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE)
bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0);
- rqstp->rq_respages = &rqstp->rq_pages[i];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen);
if (seek) {
iov_iter_advance(&msg.msg_iter, seek);
buflen -= seek;
}
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len > 0)
svc_flush_bvec(bvec, len, seek);
@@ -639,13 +675,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
if (len <= rqstp->rq_arg.head[0].iov_len) {
rqstp->rq_arg.head[0].iov_len = len;
rqstp->rq_arg.page_len = 0;
- rqstp->rq_respages = rqstp->rq_pages+1;
} else {
rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
- rqstp->rq_respages = rqstp->rq_pages + 1 +
- DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
}
- rqstp->rq_next_page = rqstp->rq_respages+1;
if (serv->sv_stats)
serv->sv_stats->netudpcnt++;
@@ -713,15 +745,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
if (svc_xprt_is_dead(xprt))
goto out_notconn;
- count = xdr_buf_to_bvec(rqstp->rq_bvec,
- ARRAY_SIZE(rqstp->rq_bvec), xdr);
+ count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr);
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
count, rqstp->rq_res.len);
err = sock_sendmsg(svsk->sk_sock, &msg);
if (err == -ECONNREFUSED) {
/* ICMP error on earlier request. */
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
count, rqstp->rq_res.len);
err = sock_sendmsg(svsk->sk_sock, &msg);
}
@@ -810,6 +841,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
/* data might have come in before data_ready set up */
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags);
/* make sure we get destination address info */
switch (svsk->sk_sk->sk_family) {
@@ -956,7 +988,7 @@ static size_t svc_tcp_restore_pages(struct svc_sock *svsk,
npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < npages; i++) {
if (rqstp->rq_pages[i] != NULL)
- put_page(rqstp->rq_pages[i]);
+ svc_rqst_page_release(rqstp, rqstp->rq_pages[i]);
BUG_ON(svsk->sk_pages[i] == NULL);
rqstp->rq_pages[i] = svsk->sk_pages[i];
svsk->sk_pages[i] = NULL;
@@ -977,6 +1009,7 @@ static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
svsk->sk_pages[i] = rqstp->rq_pages[i];
rqstp->rq_pages[i] = NULL;
}
+ rqstp->rq_pages_nfree = npages;
}
static void svc_tcp_clear_pages(struct svc_sock *svsk)
@@ -1019,7 +1052,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
iov.iov_len = want;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len < 0)
return len;
svsk->sk_tcplen += len;
@@ -1035,9 +1068,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
return svc_sock_reclen(svsk);
err_too_large:
- net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n",
- __func__, svsk->sk_xprt.xpt_server->sv_name,
- svc_sock_reclen(svsk));
+ net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n",
+ svsk->sk_xprt.xpt_server->sv_name,
+ svc_sock_reclen(svsk),
+ (struct sockaddr *)&svsk->sk_xprt.xpt_remote);
svc_xprt_deferred_close(&svsk->sk_xprt);
err_short:
return -EAGAIN;
@@ -1198,7 +1232,7 @@ err_noclose:
* that the pages backing @xdr are unchanging.
*/
static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
- rpc_fraghdr marker, unsigned int *sentp)
+ rpc_fraghdr marker)
{
struct msghdr msg = {
.msg_flags = MSG_SPLICE_PAGES,
@@ -1207,29 +1241,24 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
void *buf;
int ret;
- *sentp = 0;
-
/* The stream record marker is copied into a temporary page
- * fragment buffer so that it can be included in rq_bvec.
+ * fragment buffer so that it can be included in sk_bvec.
*/
buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, &marker, sizeof(marker));
- bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker));
+ bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker));
- count = xdr_buf_to_bvec(rqstp->rq_bvec + 1,
- ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res);
+ count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages,
+ &rqstp->rq_res);
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
1 + count, sizeof(marker) + rqstp->rq_res.len);
ret = sock_sendmsg(svsk->sk_sock, &msg);
page_frag_free(buf);
- if (ret < 0)
- return ret;
- *sentp += ret;
- return 0;
+ return ret;
}
/**
@@ -1248,8 +1277,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
struct xdr_buf *xdr = &rqstp->rq_res;
rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
(u32)xdr->len);
- unsigned int sent;
- int err;
+ int sent;
svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt);
rqstp->rq_xprt_ctxt = NULL;
@@ -1257,9 +1285,9 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
mutex_lock(&xprt->xpt_mutex);
if (svc_xprt_is_dead(xprt))
goto out_notconn;
- err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent);
- trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
- if (err < 0 || sent != (xdr->len + sizeof(marker)))
+ sent = svc_tcp_sendmsg(svsk, rqstp, marker);
+ trace_svcsock_tcp_send(xprt, sent);
+ if (sent < 0 || sent != (xdr->len + sizeof(marker)))
goto out_close;
mutex_unlock(&xprt->xpt_mutex);
return sent;
@@ -1268,10 +1296,10 @@ out_notconn:
mutex_unlock(&xprt->xpt_mutex);
return -ENOTCONN;
out_close:
- pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n",
xprt->xpt_server->sv_name,
- (err < 0) ? "got error" : "sent",
- (err < 0) ? err : sent, xdr->len);
+ (sent < 0) ? "got error" : "sent",
+ sent, xdr->len + sizeof(marker));
svc_xprt_deferred_close(xprt);
mutex_unlock(&xprt->xpt_mutex);
return -EAGAIN;
@@ -1330,6 +1358,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
if (sk->sk_state == TCP_LISTEN) {
strcpy(svsk->sk_xprt.xpt_remotebuf, "listener");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
} else {
@@ -1340,7 +1369,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
svsk->sk_marker = xdr_zero;
svsk->sk_tcplen = 0;
svsk->sk_datalen = 0;
- memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
+ memset(&svsk->sk_pages[0], 0,
+ svsk->sk_maxpages * sizeof(struct page *));
tcp_sock_set_nodelay(sk);
@@ -1369,6 +1399,20 @@ void svc_sock_update_bufs(struct svc_serv *serv)
spin_unlock_bh(&serv->sv_lock);
}
+/*
+ * Return how many send bio_vec entries a socket of this type needs:
+ * SUNRPC_MAX_UDP_SENDPAGES for datagram sockets; for stream sockets
+ * svc_serv_maxpages() + 1 when SVC_SOCK_TEMPORARY is set in @flags
+ * and 0 otherwise. Any other socket type yields -EINVAL.
+ */
+static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags)
+{
+	if (sock->type == SOCK_DGRAM)
+		return SUNRPC_MAX_UDP_SENDPAGES;
+	if (sock->type != SOCK_STREAM)
+		return -EINVAL;
+
+	if (!(flags & SVC_SOCK_TEMPORARY))
+		return 0;
+	/* +1 for TCP record marker */
+	return svc_serv_maxpages(serv) + 1;
+}
+
/*
* Initialize socket for RPC use and create svc_sock struct
*/
@@ -1379,11 +1423,28 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+ int sendpages;
+ unsigned long pages;
- svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+ sendpages = svc_sock_sendpages(serv, sock, flags);
+ if (sendpages < 0)
+ return ERR_PTR(sendpages);
+
+ pages = svc_serv_maxpages(serv);
+ svsk = kzalloc_flex(*svsk, sk_pages, pages);
if (!svsk)
return ERR_PTR(-ENOMEM);
+ if (sendpages) {
+ svsk->sk_bvec = kzalloc_objs(*svsk->sk_bvec, sendpages);
+ if (!svsk->sk_bvec) {
+ kfree(svsk);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ svsk->sk_maxpages = pages;
+
inet = sock->sk;
if (pmap_register) {
@@ -1393,6 +1454,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
inet->sk_protocol,
ntohs(inet_sk(inet)->inet_sport));
if (err < 0) {
+ kfree(svsk->sk_bvec);
kfree(svsk);
return ERR_PTR(err);
}
@@ -1531,7 +1593,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
ip6_sock_set_v6only(sock->sk);
if (type == SOCK_STREAM)
sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
- error = kernel_bind(sock, sin, len);
+ error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len);
if (error < 0)
goto bummer;
@@ -1542,7 +1604,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
if (protocol == IPPROTO_TCP) {
sk_net_refcnt_upgrade(sock->sk);
- if ((error = kernel_listen(sock, 64)) < 0)
+ if ((error = kernel_listen(sock, SOMAXCONN)) < 0)
goto bummer;
}
@@ -1610,5 +1672,6 @@ static void svc_sock_free(struct svc_xprt *xprt)
sock_release(sock);
page_frag_cache_drain(&svsk->sk_frag_cache);
+ kfree(svsk->sk_bvec);
kfree(svsk);
}
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 5c8ecdaaa985..a90480f80154 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -6,6 +6,7 @@
#include <linux/kobject.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtsock.h>
+#include <net/net_namespace.h>
#include "sysfs.h"
@@ -48,7 +49,7 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name,
{
struct kobject *kobj;
- kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+ kobj = kzalloc_obj(*kobj);
if (kobj) {
kobj->kset = kset;
if (kobject_init_and_add(kobj, &rpc_sysfs_object_type,
@@ -59,6 +60,16 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name,
return NULL;
}
+/*
+ * Map @kobj to the rpc_clnt of its enclosing rpc_sysfs_client and try
+ * to take a cl_count reference on it. Returns the client on success,
+ * or NULL when cl_count had already dropped to zero.
+ */
+static inline struct rpc_clnt *
+rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj)
+{
+	struct rpc_sysfs_client *sysfs_clnt;
+	struct rpc_clnt *clnt;
+
+	sysfs_clnt = container_of(kobj, struct rpc_sysfs_client, kobject);
+	clnt = sysfs_clnt->clnt;
+	if (!refcount_inc_not_zero(&clnt->cl_count))
+		return NULL;
+	return clnt;
+}
+
static inline struct rpc_xprt *
rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj)
{
@@ -86,6 +97,51 @@ rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj)
return xprt_switch_get(x->xprt_switch);
}
+/*
+ * sysfs show handler: print the client's cl_vers, or "<closed>" when
+ * no reference to the client could be taken. Returns the number of
+ * bytes written to @buf.
+ */
+static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj);
+	ssize_t ret;
+
+	if (!clnt)
+		return sprintf(buf, "<closed>\n");
+
+	/* Newline-terminated, like the "<closed>" case above. */
+	ret = sprintf(buf, "%u\n", clnt->cl_vers);
+	/* NOTE(review): refcount_dec() never frees; confirm this cannot
+	 * be the last reference to the client.
+	 */
+	refcount_dec(&clnt->cl_count);
+	return ret;
+}
+
+/*
+ * sysfs show handler: print the name of the client's RPC program, or
+ * "<closed>" when no reference to the client could be taken. Returns
+ * the number of bytes written to @buf.
+ */
+static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj);
+	ssize_t ret;
+
+	if (!clnt)
+		return sprintf(buf, "<closed>\n");
+
+	/* Newline-terminated, like the "<closed>" case above. */
+	ret = sprintf(buf, "%s\n", clnt->cl_program->name);
+	/* NOTE(review): refcount_dec() never frees; confirm this cannot
+	 * be the last reference to the client.
+	 */
+	refcount_dec(&clnt->cl_count);
+	return ret;
+}
+
+/*
+ * sysfs show handler: print the client's cl_max_connect followed by a
+ * newline, or "<closed>" when no reference to the client could be
+ * taken. Returns the number of bytes written to @buf.
+ */
+static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj,
+					       struct kobj_attribute *attr,
+					       char *buf)
+{
+	struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj);
+	ssize_t len;
+
+	if (clnt) {
+		len = sprintf(buf, "%u\n", clnt->cl_max_connect);
+		/* Drop the reference taken by the lookup above. */
+		refcount_dec(&clnt->cl_count);
+	} else {
+		len = sprintf(buf, "<closed>\n");
+	}
+	return len;
+}
+
static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
@@ -129,6 +185,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
return ret;
}
+/* Printable names for the transport security policies, indexed by policy. */
+static const char * const xprtsec_strings[] = {
+	[RPC_XPRTSEC_NONE] = "none",
+	[RPC_XPRTSEC_TLS_ANON] = "tls-anon",
+	[RPC_XPRTSEC_TLS_X509] = "tls-x509",
+};
+
+/*
+ * sysfs show handler: print the name of the transport's xprtsec
+ * policy, "unknown" for a policy value without a table entry, or
+ * "<closed>" when the transport is no longer available. Returns the
+ * number of bytes written to @buf.
+ */
+static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	const char *name = NULL;
+	unsigned int policy;
+	ssize_t ret;
+
+	if (!xprt)
+		return sprintf(buf, "<closed>\n");
+
+	/* Never index past the table, and never print a hole in it. */
+	policy = xprt->xprtsec.policy;
+	if (policy < ARRAY_SIZE(xprtsec_strings))
+		name = xprtsec_strings[policy];
+
+	ret = sprintf(buf, "%s\n", name ? name : "unknown");
+	xprt_put(xprt);
+	return ret;
+}
+
static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -206,6 +287,14 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
return ret;
}
+/*
+ * sysfs show handler: emits a fixed one-line hint describing what
+ * writing to this attribute does. @kobj and @attr are not used.
+ */
+static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj,
+					    struct kobj_attribute *attr,
+					    char *buf)
+{
+	ssize_t len = sprintf(buf, "# delete this xprt\n");
+
+	return len;
+}
+
+
static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
@@ -225,6 +314,55 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj,
return ret;
}
+/*
+ * sysfs show handler: emits a fixed one-line hint describing what
+ * writing to this attribute does. @kobj and @attr are not used.
+ */
+static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj,
+						   struct kobj_attribute *attr,
+						   char *buf)
+{
+	ssize_t len = sprintf(buf, "# add one xprt to this xprt_switch\n");
+
+	return len;
+}
+
+/*
+ * sysfs store handler: create a new transport with the same class,
+ * net namespace, destination address, server name, backchannel,
+ * transport security and timeouts as the switch's main transport, and
+ * add it to the switch. The written data in @buf is not interpreted.
+ *
+ * Returns @count on success (and also when the switch has no main
+ * transport), 0 when the switch is gone, or a negative errno when the
+ * transport could not be created.
+ */
+static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj,
+						    struct kobj_attribute *attr,
+						    const char *buf, size_t count)
+{
+	struct rpc_xprt_switch *xprt_switch =
+		rpc_sysfs_xprt_switch_kobj_get_xprt(kobj);
+	struct xprt_create xprt_create_args;
+	struct rpc_xprt *xprt, *new;
+
+	if (!xprt_switch)
+		return 0;
+
+	xprt = rpc_xprt_switch_get_main_xprt(xprt_switch);
+	if (!xprt)
+		goto out;
+
+	/* A compound literal zeroes every member not named here, so
+	 * xprt_create_transport() never sees indeterminate stack data.
+	 */
+	xprt_create_args = (struct xprt_create) {
+		.ident = xprt->xprt_class->ident,
+		.net = xprt->xprt_net,
+		.dstaddr = (struct sockaddr *)&xprt->addr,
+		.addrlen = xprt->addrlen,
+		.servername = xprt->servername,
+		.bc_xprt = xprt->bc_xprt,
+		.xprtsec = xprt->xprtsec,
+		.connect_timeout = xprt->connect_timeout,
+		.reconnect_timeout = xprt->max_reconnect_timeout,
+	};
+
+	new = xprt_create_transport(&xprt_create_args);
+	if (IS_ERR_OR_NULL(new)) {
+		/* PTR_ERR(NULL) is 0; report a NULL result as an error
+		 * rather than as a zero-byte write.
+		 */
+		count = new ? PTR_ERR(new) : -ENOMEM;
+		goto out_put_xprt;
+	}
+
+	rpc_xprt_switch_add_xprt(xprt_switch, new);
+	/* Drop the local reference to the new transport. */
+	xprt_put(new);
+
+out_put_xprt:
+	xprt_put(xprt);
+out:
+	xprt_switch_put(xprt_switch);
+	return count;
+}
+
static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -252,7 +390,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
saddr = (struct sockaddr *)&xprt->addr;
port = rpc_get_port(saddr);
- /* buf_len is the len until the first occurence of either
+ /* buf_len is the len until the first occurrence of either
* '\n' or '\0'
*/
buf_len = strcspn(buf, "\n");
@@ -260,7 +398,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
dst_addr = kstrndup(buf, buf_len, GFP_KERNEL);
if (!dst_addr)
goto out_err;
- saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL);
+ saved_addr = kzalloc_obj(*saved_addr);
if (!saved_addr)
goto out_err_free;
saved_addr->addr =
@@ -335,6 +473,40 @@ out_put:
return count;
}
+/*
+ * sysfs store handler: take the transport offline and delete it from
+ * its transport switch. The written data in @buf is not interpreted.
+ *
+ * Returns @count on success, 0 when the transport or its switch is
+ * gone, -EINVAL for the main transport, or -EINTR when the wait for
+ * XPRT_LOCKED was interrupted.
+ */
+static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj);
+
+	if (!xprt || !xps) {
+		count = 0;
+		goto out_put;
+	}
+
+	/* XPRT_LOCKED has not been taken yet, so this path must not
+	 * go through xprt_release_write().
+	 */
+	if (xprt->main) {
+		count = -EINVAL;
+		goto out_put;
+	}
+
+	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+		count = -EINTR;
+		goto out_put;
+	}
+
+	xprt_set_offline_locked(xprt, xps);
+	xprt_delete_locked(xprt, xps);
+	xprt_release_write(xprt, NULL);
+
+out_put:
+	/* Either lookup may have failed on its own; drop only the
+	 * references that were actually obtained.
+	 */
+	if (xprt)
+		xprt_put(xprt);
+	if (xps)
+		xprt_switch_put(xps);
+	return count;
+}
+
int rpc_sysfs_init(void)
{
rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj);
@@ -382,39 +554,66 @@ static void rpc_sysfs_xprt_release(struct kobject *kobj)
kfree(xprt);
}
-static const void *rpc_sysfs_client_namespace(const struct kobject *kobj)
+static const struct ns_common *rpc_sysfs_client_namespace(const struct kobject *kobj)
{
- return container_of(kobj, struct rpc_sysfs_client, kobject)->net;
+ return to_ns_common(container_of(kobj, struct rpc_sysfs_client,
+ kobject)->net);
}
-static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj)
+static const struct ns_common *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj)
{
- return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net;
+ return to_ns_common(container_of(kobj, struct rpc_sysfs_xprt_switch,
+ kobject)->net);
}
-static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj)
+static const struct ns_common *rpc_sysfs_xprt_namespace(const struct kobject *kobj)
{
- return container_of(kobj, struct rpc_sysfs_xprt,
- kobject)->xprt->xprt_net;
+ return to_ns_common(container_of(kobj, struct rpc_sysfs_xprt,
+ kobject)->xprt->xprt_net);
}
+/*
+ * Per-client sysfs attributes. Each one is created with mode 0444 and
+ * has a show handler but no store handler.
+ */
+static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version,
+ 0444, rpc_sysfs_clnt_version_show, NULL);
+
+static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program,
+ 0444, rpc_sysfs_clnt_program_show, NULL);
+
+static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect,
+ 0444, rpc_sysfs_clnt_max_connect_show, NULL);
+
+/*
+ * NULL-terminated list of the client attributes above; ATTRIBUTE_GROUPS()
+ * derives rpc_sysfs_rpc_clnt_groups from this array.
+ */
+static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = {
+ &rpc_sysfs_clnt_version.attr,
+ &rpc_sysfs_clnt_program.attr,
+ &rpc_sysfs_clnt_max_connect.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt);
+
static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr,
0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store);
static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr,
0644, rpc_sysfs_xprt_srcaddr_show, NULL);
+/*
+ * Transport security attribute. It has a show handler only, so it is
+ * created read-only rather than advertising write access that no store
+ * handler backs.
+ */
+static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec,
+	0444, rpc_sysfs_xprt_xprtsec_show, NULL);
+
static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info,
0444, rpc_sysfs_xprt_info_show, NULL);
static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state,
0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change);
+static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt,
+ 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt);
+
static struct attribute *rpc_sysfs_xprt_attrs[] = {
&rpc_sysfs_xprt_dstaddr.attr,
&rpc_sysfs_xprt_srcaddr.attr,
+ &rpc_sysfs_xprt_xprtsec.attr,
&rpc_sysfs_xprt_info.attr,
&rpc_sysfs_xprt_change_state.attr,
+ &rpc_sysfs_xprt_del.attr,
NULL,
};
ATTRIBUTE_GROUPS(rpc_sysfs_xprt);
@@ -422,14 +621,20 @@ ATTRIBUTE_GROUPS(rpc_sysfs_xprt);
static struct kobj_attribute rpc_sysfs_xprt_switch_info =
__ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL);
+static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt =
+ __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show,
+ rpc_sysfs_xprt_switch_add_xprt_store);
+
static struct attribute *rpc_sysfs_xprt_switch_attrs[] = {
&rpc_sysfs_xprt_switch_info.attr,
+ &rpc_sysfs_xprt_switch_add_xprt.attr,
NULL,
};
ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch);
static const struct kobj_type rpc_sysfs_client_type = {
.release = rpc_sysfs_client_release,
+ .default_groups = rpc_sysfs_rpc_clnt_groups,
.sysfs_ops = &kobj_sysfs_ops,
.namespace = rpc_sysfs_client_namespace,
};
@@ -461,7 +666,7 @@ static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent,
{
struct rpc_sysfs_client *p;
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kzalloc_obj(*p);
if (p) {
p->net = net;
p->kobject.kset = rpc_sunrpc_kset;
@@ -481,7 +686,7 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent,
{
struct rpc_sysfs_xprt_switch *p;
- p = kzalloc(sizeof(*p), gfp_flags);
+ p = kzalloc_obj(*p, gfp_flags);
if (p) {
p->net = net;
p->kobject.kset = rpc_sunrpc_kset;
@@ -501,7 +706,7 @@ static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent,
{
struct rpc_sysfs_xprt *p;
- p = kzalloc(sizeof(*p), gfp_flags);
+ p = kzalloc_obj(*p, gfp_flags);
if (!p)
goto out;
p->kobject.kset = rpc_sunrpc_kset;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 4e003cb516fe..e83d5d0be78b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_netobj);
-__be32 *
-xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
-{
- unsigned int len;
-
- if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ)
- return NULL;
- obj->len = len;
- obj->data = (u8 *) p;
- return p + XDR_QUADLEN(len);
-}
-EXPORT_SYMBOL_GPL(xdr_decode_netobj);
-
/**
* xdr_encode_opaque_fixed - Encode fixed length opaque data
* @p: pointer to current position in XDR buffer.
@@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string)
}
EXPORT_SYMBOL_GPL(xdr_encode_string);
-__be32 *
-xdr_decode_string_inplace(__be32 *p, char **sp,
- unsigned int *lenp, unsigned int maxlen)
-{
- u32 len;
-
- len = be32_to_cpu(*p++);
- if (len > maxlen)
- return NULL;
- *lenp = len;
- *sp = (char *) p;
- return p + XDR_QUADLEN(len);
-}
-EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
-
/**
* xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
* @buf: XDR buffer where string resides
@@ -146,7 +118,7 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
size_t i, n = xdr_buf_pagecount(buf);
if (n != 0 && buf->bvec == NULL) {
- buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
+ buf->bvec = kmalloc_objs(buf->bvec[0], n, gfp);
if (!buf->bvec)
return -ENOMEM;
for (i = 0; i < n; i++) {
@@ -213,6 +185,7 @@ bvec_overflow:
pr_warn_once("%s: bio_vec array overflow\n", __func__);
return count - 1;
}
+EXPORT_SYMBOL_GPL(xdr_buf_to_bvec);
/**
* xdr_inline_pages - Prepare receive buffer for a large reply
@@ -992,21 +965,18 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
* xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer into which to encode data
- * @pages: list of pages to decode into
- * @rqst: pointer to controlling rpc_rqst, for debugging
*
*/
-void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
- struct page **pages, struct rpc_rqst *rqst)
+void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf)
{
xdr_reset_scratch_buffer(xdr);
xdr->buf = buf;
- xdr->page_ptr = pages;
+ xdr->page_ptr = buf->pages;
xdr->iov = NULL;
- xdr->p = page_address(*pages);
+ xdr->p = page_address(*xdr->page_ptr);
xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
- xdr->rqst = rqst;
+ xdr->rqst = NULL;
}
EXPORT_SYMBOL_GPL(xdr_init_encode_pages);
@@ -2247,88 +2217,6 @@ out:
EXPORT_SYMBOL_GPL(xdr_process_buf);
/**
- * xdr_stream_decode_opaque - Decode variable length opaque
- * @xdr: pointer to xdr_stream
- * @ptr: location to store opaque data
- * @size: size of storage buffer @ptr
- *
- * Return values:
- * On success, returns size of object stored in *@ptr
- * %-EBADMSG on XDR buffer overflow
- * %-EMSGSIZE on overflow of storage buffer @ptr
- */
-ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size)
-{
- ssize_t ret;
- void *p;
-
- ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
- if (ret <= 0)
- return ret;
- memcpy(ptr, p, ret);
- return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque);
-
-/**
- * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque
- * @xdr: pointer to xdr_stream
- * @ptr: location to store pointer to opaque data
- * @maxlen: maximum acceptable object size
- * @gfp_flags: GFP mask to use
- *
- * Return values:
- * On success, returns size of object stored in *@ptr
- * %-EBADMSG on XDR buffer overflow
- * %-EMSGSIZE if the size of the object would exceed @maxlen
- * %-ENOMEM on memory allocation failure
- */
-ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr,
- size_t maxlen, gfp_t gfp_flags)
-{
- ssize_t ret;
- void *p;
-
- ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
- if (ret > 0) {
- *ptr = kmemdup(p, ret, gfp_flags);
- if (*ptr != NULL)
- return ret;
- ret = -ENOMEM;
- }
- *ptr = NULL;
- return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup);
-
-/**
- * xdr_stream_decode_string - Decode variable length string
- * @xdr: pointer to xdr_stream
- * @str: location to store string
- * @size: size of storage buffer @str
- *
- * Return values:
- * On success, returns length of NUL-terminated string stored in *@str
- * %-EBADMSG on XDR buffer overflow
- * %-EMSGSIZE on overflow of storage buffer @str
- */
-ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size)
-{
- ssize_t ret;
- void *p;
-
- ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
- if (ret > 0) {
- memcpy(str, p, ret);
- str[ret] = '\0';
- return strlen(str);
- }
- *str = '\0';
- return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_string);
-
-/**
* xdr_stream_decode_string_dup - Decode and duplicate variable length string
* @xdr: pointer to xdr_stream
* @str: location to store pointer to string
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 09f245cda526..48a3618cbb29 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -854,7 +854,7 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
static void
xprt_init_autodisconnect(struct timer_list *t)
{
- struct rpc_xprt *xprt = from_timer(xprt, t, timer);
+ struct rpc_xprt *xprt = timer_container_of(xprt, t, timer);
if (!RB_EMPTY_ROOT(&xprt->recv_queue))
return;
@@ -1167,7 +1167,7 @@ xprt_request_enqueue_receive(struct rpc_task *task)
spin_unlock(&xprt->queue_lock);
/* Turn off autodisconnect */
- del_timer_sync(&xprt->timer);
+ timer_delete_sync(&xprt->timer);
return 0;
}
@@ -1365,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
INIT_LIST_HEAD(&req->rq_xmit2);
goto out;
}
- } else if (!req->rq_seqno) {
+ } else if (req->rq_seqno_count == 0) {
list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
if (pos->rq_task->tk_owner != task->tk_owner)
continue;
@@ -1663,6 +1663,22 @@ void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
}
EXPORT_SYMBOL_GPL(xprt_add_backlog);
+/**
+ * xprt_add_backlog_noncongested - queue task on backlog
+ * @xprt: transport whose backlog queue receives the task
+ * @task: task to queue
+ *
+ * Like xprt_add_backlog, but does not set XPRT_CONGESTED.
+ * For transports whose free_slot path does not synchronize
+ * with xprt_throttle_congested via reserve_lock.
+ *
+ * The task is put to sleep on @xprt->backlog via rpc_sleep_on(),
+ * with xprt_complete_request_init passed as the callback argument.
+ * No transport state is modified here.
+ */
+void xprt_add_backlog_noncongested(struct rpc_xprt *xprt,
+ struct rpc_task *task)
+{
+ rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init);
+}
+EXPORT_SYMBOL_GPL(xprt_add_backlog_noncongested);
+
static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
struct rpc_rqst *req = data;
@@ -1709,7 +1725,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
goto out;
++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
- req = kzalloc(sizeof(*req), rpc_task_gfp_mask());
+ req = kzalloc_obj(*req, rpc_task_gfp_mask());
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
@@ -1829,7 +1845,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
xprt_init(xprt, net);
for (i = 0; i < num_prealloc; i++) {
- req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+ req = kzalloc_obj(struct rpc_rqst);
if (!req)
goto out_free;
list_add(&req->rq_list, &xprt->free);
@@ -1898,6 +1914,7 @@ xprt_request_init(struct rpc_task *task)
req->rq_snd_buf.bvec = NULL;
req->rq_rcv_buf.bvec = NULL;
req->rq_release_snd_buf = NULL;
+ req->rq_seqno_count = 0;
xprt_init_majortimeo(task, req, task->tk_client->cl_timeout);
trace_xprt_reserve(req);
@@ -2138,7 +2155,7 @@ static void xprt_destroy(struct rpc_xprt *xprt)
* can only run *before* del_time_sync(), never after.
*/
spin_lock(&xprt->transport_lock);
- del_timer_sync(&xprt->timer);
+ timer_delete_sync(&xprt->timer);
spin_unlock(&xprt->transport_lock);
/*
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 7e98d4dd9f10..3ba818d637be 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -92,6 +92,27 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
xprt_put(xprt);
}
+/**
+ * rpc_xprt_switch_get_main_xprt - find the transport flagged 'main' in a switch
+ * @xps: transport switch to search
+ *
+ * Walks the transports of @xps with a list-all iterator. Every transport
+ * handed out by the iterator that does not have ->main set is released
+ * with xprt_put() before the next one is fetched.
+ *
+ * Return: the first transport with ->main set, or NULL if the iterator is
+ * exhausted first. The returned transport is not put here, so presumably
+ * the caller owns that reference - confirm against xprt_iter_get_next().
+ */
+struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps)
+{
+	struct rpc_xprt_iter iter;
+	struct rpc_xprt *candidate;
+
+	xprt_iter_init_listall(&iter, xps);
+	for (;;) {
+		candidate = xprt_iter_get_next(&iter);
+		/* Stop at end of list or at the main transport. */
+		if (!candidate || candidate->main)
+			break;
+		xprt_put(candidate);
+	}
+	xprt_iter_destroy(&iter);
+
+	return candidate;
+}
+
static DEFINE_IDA(rpc_xprtswitch_ids);
void xprt_multipath_cleanup_ids(void)
@@ -129,7 +150,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
{
struct rpc_xprt_switch *xps;
- xps = kmalloc(sizeof(*xps), gfp_flags);
+ xps = kmalloc_obj(*xps, gfp_flags);
if (xps != NULL) {
spin_lock_init(&xps->xps_lock);
kref_init(&xps->xps_kref);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 8c817e755262..2f0f9618dd05 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -9,6 +9,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/bc_xprt.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
@@ -220,7 +221,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_rep *rep)
{
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct svc_serv *bc_serv;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
struct xdr_buf *buf;
@@ -261,11 +261,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
trace_xprtrdma_cb_call(r_xprt, rqst);
/* Queue rqst for ULP's callback service */
- bc_serv = xprt->bc_serv;
- xprt_get(xprt);
- lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
-
- svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+ xprt_enqueue_bc_request(rqst);
r_xprt->rx_stats.bcall_count++;
return;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 31434aeb8e29..7f79a0a2601e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -244,9 +244,10 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
}
ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_recv_batch = ep->re_max_requests >> 2;
ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
+ ep->re_attr.cap.max_recv_wr += ep->re_recv_batch;
ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
ep->re_max_rdma_segs =
@@ -268,10 +269,9 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
}
/**
- * frwr_map - Register a memory region
+ * frwr_map - Register a memory region from an xdr_buf cursor
* @r_xprt: controlling transport
- * @seg: memory region co-ordinates
- * @nsegs: number of segments remaining
+ * @cur: cursor tracking position within the xdr_buf
* @writing: true when RDMA Write will be used
* @xid: XID of RPC using the registered memory
* @mr: MR to fill in
@@ -279,34 +279,104 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
* Prepare a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*
- * Returns the next segment or a negative errno pointer.
- * On success, @mr is filled in.
+ * Returns 0 on success (cursor advanced past consumed data,
+ * @mr populated) or a negative errno on failure.
*/
-struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing, __be32 xid,
- struct rpcrdma_mr *mr)
+int frwr_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_xdr_cursor *cur,
+ bool writing, __be32 xid,
+ struct rpcrdma_mr *mr)
{
struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ const struct xdr_buf *xdrbuf = cur->xc_buf;
+ bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS;
+ unsigned int max_depth = ep->re_max_fr_depth;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ep->re_max_fr_depth)
- nsegs = ep->re_max_fr_depth;
- for (i = 0; i < nsegs;) {
- sg_set_page(&mr->mr_sg[i], seg->mr_page,
- seg->mr_len, seg->mr_offset);
-
- ++seg;
- ++i;
- if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
- continue;
- if ((i < nsegs && seg->mr_offset) ||
- offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
- break;
+ i = 0;
+
+ /* Head kvec */
+ if (!(cur->xc_flags & XC_HEAD_DONE)) {
+ const struct kvec *head = &xdrbuf->head[0];
+
+ sg_set_page(&mr->mr_sg[i],
+ virt_to_page(head->iov_base),
+ head->iov_len,
+ offset_in_page(head->iov_base));
+ cur->xc_flags |= XC_HEAD_DONE;
+ i++;
+ /* Without sg-gap support, each non-contiguous region
+ * must be registered as a separate MR. Returning
+ * here after the head kvec causes the caller to
+ * invoke frwr_map() again for the page list and
+ * tail.
+ */
+ if (!sg_gaps)
+ goto finish;
}
+
+ /* Page list */
+ if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) {
+ unsigned int page_base, remaining;
+ struct page **ppages;
+
+ remaining = xdrbuf->page_len - cur->xc_page_offset;
+ page_base = offset_in_page(xdrbuf->page_base +
+ cur->xc_page_offset);
+ ppages = xdrbuf->pages +
+ ((xdrbuf->page_base + cur->xc_page_offset)
+ >> PAGE_SHIFT);
+
+ while (remaining > 0 && i < max_depth) {
+ unsigned int len;
+
+ len = min_t(unsigned int,
+ PAGE_SIZE - page_base, remaining);
+ sg_set_page(&mr->mr_sg[i], *ppages,
+ len, page_base);
+ cur->xc_page_offset += len;
+ i++;
+ ppages++;
+ remaining -= len;
+
+ if (!sg_gaps && remaining > 0 &&
+ offset_in_page(page_base + len))
+ goto finish;
+ page_base = 0;
+ }
+ if (remaining == 0)
+ cur->xc_flags |= XC_PAGES_DONE;
+ } else if (!(cur->xc_flags & XC_PAGES_DONE)) {
+ cur->xc_flags |= XC_PAGES_DONE;
+ }
+
+ /* Tail kvec */
+ if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len &&
+ i < max_depth) {
+ const struct kvec *tail = &xdrbuf->tail[0];
+
+ if (!sg_gaps && i > 0) {
+ struct scatterlist *prev = &mr->mr_sg[i - 1];
+
+ if (offset_in_page(prev->offset + prev->length) ||
+ offset_in_page(tail->iov_base))
+ goto finish;
+ }
+ sg_set_page(&mr->mr_sg[i],
+ virt_to_page(tail->iov_base),
+ tail->iov_len,
+ offset_in_page(tail->iov_base));
+ cur->xc_flags |= XC_TAIL_DONE;
+ i++;
+ } else if (!(cur->xc_flags & XC_TAIL_DONE) &&
+ !xdrbuf->tail[0].iov_len) {
+ cur->xc_flags |= XC_TAIL_DONE;
+ }
+
+finish:
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
@@ -338,15 +408,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_offset = ibmr->iova;
trace_xprtrdma_mr_map(mr);
- return seg;
+ return 0;
out_dmamap_err:
trace_xprtrdma_frwr_sgerr(mr, i);
- return ERR_PTR(-EIO);
+ return -EIO;
out_mapmr_err:
trace_xprtrdma_frwr_maperr(mr, n);
- return ERR_PTR(-EIO);
+ return -EIO;
}
/**
@@ -669,9 +739,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
- struct rpcrdma_mr_seg seg;
+ struct ib_reg_wr *reg_wr;
struct rpcrdma_mr *mr;
+ struct ib_mr *ibmr;
+ int dma_nents;
+ int ret;
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
@@ -679,11 +753,39 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
mr->mr_req = NULL;
ep->re_write_pad_mr = mr;
- seg.mr_len = XDR_UNIT;
- seg.mr_page = virt_to_page(ep->re_write_pad);
- seg.mr_offset = offset_in_page(ep->re_write_pad);
- if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
- return -EIO;
+ sg_init_table(mr->mr_sg, 1);
+ sg_set_page(mr->mr_sg, virt_to_page(ep->re_write_pad),
+ XDR_UNIT, offset_in_page(ep->re_write_pad));
+
+ mr->mr_dir = DMA_FROM_DEVICE;
+ mr->mr_nents = 1;
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg,
+ mr->mr_nents, mr->mr_dir);
+ if (!dma_nents) {
+ ret = -EIO;
+ goto out_mr;
+ }
+ mr->mr_device = ep->re_id->device;
+
+ ibmr = mr->mr_ibmr;
+ if (ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL,
+ PAGE_SIZE) != dma_nents) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+ /* IOVA is not tagged with an XID; the write-pad is not RPC-specific. */
+ ib_update_fast_reg_key(ibmr, ib_inc_rkey(ibmr->rkey));
+
+ reg_wr = &mr->mr_regwr;
+ reg_wr->mr = ibmr;
+ reg_wr->key = ibmr->rkey;
+ reg_wr->access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
+
+ mr->mr_handle = ibmr->rkey;
+ mr->mr_length = ibmr->length;
+ mr->mr_offset = ibmr->iova;
+
trace_xprtrdma_mr_fastreg(mr);
mr->mr_cqe.done = frwr_wc_fastreg;
@@ -693,5 +795,16 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
mr->mr_regwr.wr.send_flags = 0;
- return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
+ ret = ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
+ if (!ret)
+ return 0;
+
+out_unmap:
+ frwr_mr_unmap(mr);
+out_mr:
+ ep->re_write_pad_mr = NULL;
+ spin_lock(&buf->rb_lock);
+ rpcrdma_mr_push(mr, &buf->rb_mrs);
+ spin_unlock(&buf->rb_lock);
+ return ret;
}
diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c
index 28c68b5f6823..de49ad02053d 100644
--- a/net/sunrpc/xprtrdma/ib_client.c
+++ b/net/sunrpc/xprtrdma/ib_client.c
@@ -108,7 +108,7 @@ static int rpcrdma_add_one(struct ib_device *device)
{
struct rpcrdma_device *rd;
- rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ rd = kzalloc_obj(*rd);
if (!rd)
return -ENOMEM;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 1478c41c7e9d..0e0f21974710 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
while (len > 0) {
if (!*ppages)
- *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ *ppages = alloc_page(GFP_NOWAIT);
if (!*ppages)
return -ENOBUFS;
ppages++;
@@ -200,67 +200,30 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
return 0;
}
-/* Convert @vec to a single SGL element.
- *
- * Returns pointer to next available SGE, and bumps the total number
- * of SGEs consumed.
- */
-static struct rpcrdma_mr_seg *
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
- unsigned int *n)
+static void
+rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur,
+ const struct xdr_buf *xdrbuf,
+ unsigned int pos, enum rpcrdma_chunktype type)
{
- seg->mr_page = virt_to_page(vec->iov_base);
- seg->mr_offset = offset_in_page(vec->iov_base);
- seg->mr_len = vec->iov_len;
- ++seg;
- ++(*n);
- return seg;
+ cur->xc_buf = xdrbuf;
+ cur->xc_page_offset = 0;
+ cur->xc_flags = 0;
+
+ if (pos != 0)
+ cur->xc_flags |= XC_HEAD_DONE;
+ if (!xdrbuf->page_len)
+ cur->xc_flags |= XC_PAGES_DONE;
+ if (type == rpcrdma_readch || type == rpcrdma_writech ||
+ !xdrbuf->tail[0].iov_len)
+ cur->xc_flags |= XC_TAIL_DONE;
}
-/* Convert @xdrbuf into SGEs no larger than a page each. As they
- * are registered, these SGEs are then coalesced into RDMA segments
- * when the selected memreg mode supports it.
- *
- * Returns positive number of SGEs consumed, or a negative errno.
- */
-
-static int
-rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
- unsigned int pos, enum rpcrdma_chunktype type,
- struct rpcrdma_mr_seg *seg)
+static bool
+rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur)
{
- unsigned long page_base;
- unsigned int len, n;
- struct page **ppages;
-
- n = 0;
- if (pos == 0)
- seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
-
- len = xdrbuf->page_len;
- ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
- page_base = offset_in_page(xdrbuf->page_base);
- while (len) {
- seg->mr_page = *ppages;
- seg->mr_offset = page_base;
- seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- len -= seg->mr_len;
- ++ppages;
- ++seg;
- ++n;
- page_base = 0;
- }
-
- if (type == rpcrdma_readch || type == rpcrdma_writech)
- goto out;
-
- if (xdrbuf->tail[0].iov_len)
- rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
-
-out:
- if (unlikely(n > RPCRDMA_MAX_SEGS))
- return -EIO;
- return n;
+ return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE |
+ XC_TAIL_DONE)) ==
+ (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE);
}
static int
@@ -292,11 +255,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
return 0;
}
-static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req,
- struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing,
- struct rpcrdma_mr **mr)
+static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpcrdma_xdr_cursor *cur,
+ bool writing, struct rpcrdma_mr **mr)
{
*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
if (!*mr) {
@@ -307,13 +269,13 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
}
rpcrdma_mr_push(*mr, &req->rl_registered);
- return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+ return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
trace_xprtrdma_nomrs_err(r_xprt, req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
rpcrdma_mrs_refresh(r_xprt);
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
/* Register and XDR encode the Read list. Supports encoding a list of read
@@ -336,10 +298,10 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype rtype)
{
struct xdr_stream *xdr = &req->rl_stream;
- struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_xdr_cursor cur;
struct rpcrdma_mr *mr;
unsigned int pos;
- int nsegs;
+ int ret;
if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
goto done;
@@ -347,24 +309,20 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
- seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
- rtype, seg);
- if (nsegs < 0)
- return nsegs;
+ rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype);
do {
- seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
- if (IS_ERR(seg))
- return PTR_ERR(seg);
+ ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr);
+ if (ret)
+ return ret;
if (encode_read_segment(xdr, mr, pos) < 0)
return -EMSGSIZE;
- trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
+ trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr,
+ rpcrdma_xdr_cursor_done(&cur));
r_xprt->rx_stats.read_chunk_count++;
- nsegs -= mr->mr_nents;
- } while (nsegs);
+ } while (!rpcrdma_xdr_cursor_done(&cur));
done:
if (xdr_stream_encode_item_absent(xdr) < 0)
@@ -394,20 +352,16 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
- struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_xdr_cursor cur;
struct rpcrdma_mr *mr;
- int nsegs, nchunks;
+ int nchunks, ret;
__be32 *segcount;
if (wtype != rpcrdma_writech)
goto done;
- seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
- rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg);
- if (nsegs < 0)
- return nsegs;
+ rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf,
+ rqst->rq_rcv_buf.head[0].iov_len, wtype);
if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
@@ -418,30 +372,30 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
nchunks = 0;
do {
- seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
- if (IS_ERR(seg))
- return PTR_ERR(seg);
+ ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr);
+ if (ret)
+ return ret;
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
- trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
+ trace_xprtrdma_chunk_write(rqst->rq_task, mr,
+ rpcrdma_xdr_cursor_done(&cur));
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += mr->mr_length;
nchunks++;
- nsegs -= mr->mr_nents;
- } while (nsegs);
+ } while (!rpcrdma_xdr_cursor_done(&cur));
if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
return -EMSGSIZE;
trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
- nsegs);
+ true);
r_xprt->rx_stats.write_chunk_count++;
- r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+ r_xprt->rx_stats.total_rdma_request +=
+ ep->re_write_pad_mr->mr_length;
nchunks++;
- nsegs -= mr->mr_nents;
}
/* Update count of segments in this Write chunk */
@@ -471,9 +425,9 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
- struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_xdr_cursor cur;
struct rpcrdma_mr *mr;
- int nsegs, nchunks;
+ int nchunks, ret;
__be32 *segcount;
if (wtype != rpcrdma_replych) {
@@ -482,10 +436,7 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return 0;
}
- seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
- if (nsegs < 0)
- return nsegs;
+ rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype);
if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
@@ -496,19 +447,19 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
nchunks = 0;
do {
- seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
- if (IS_ERR(seg))
- return PTR_ERR(seg);
+ ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr);
+ if (ret)
+ return ret;
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
- trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
+ trace_xprtrdma_chunk_reply(rqst->rq_task, mr,
+ rpcrdma_xdr_cursor_done(&cur));
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += mr->mr_length;
nchunks++;
- nsegs -= mr->mr_nents;
- } while (nsegs);
+ } while (!rpcrdma_xdr_cursor_done(&cur));
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
@@ -1471,7 +1422,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_ep->re_max_requests)
credits = r_xprt->rx_ep->re_max_requests;
- rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1));
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
@@ -1490,15 +1440,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
/* LocalInv completion will complete the RPC */
else
kref_put(&req->rl_kref, rpcrdma_reply_done);
- return;
-out_badversion:
- trace_xprtrdma_reply_vers_err(rep);
- goto out;
+out_post:
+ rpcrdma_post_recvs(r_xprt,
+ credits + (buf->rb_bc_srv_max_requests << 1));
+ return;
out_norqst:
spin_unlock(&xprt->queue_lock);
trace_xprtrdma_reply_rqst_err(rep);
+ rpcrdma_rep_put(buf, rep);
+ goto out_post;
+
+out_badversion:
+ trace_xprtrdma_reply_vers_err(rep);
goto out;
out_shortreply:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
index b63cfeaa2923..1f8f7dad8b6f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -29,7 +29,7 @@ static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
{
struct svc_rdma_chunk *chunk;
- chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL);
+ chunk = kmalloc_flex(*chunk, ch_segments, segcount);
if (!chunk)
return NULL;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 292022f0976e..f8a0638eb095 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -118,20 +118,25 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
- int node = ibdev_to_node(rdma->sc_cm_id->device);
+ struct ib_device *device = rdma->sc_cm_id->device;
+ int node = ibdev_to_node(device);
struct svc_rdma_recv_ctxt *ctxt;
+ unsigned long pages;
dma_addr_t addr;
void *buffer;
- ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node);
+ pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
+ ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
+ GFP_KERNEL, node);
if (!ctxt)
goto fail0;
+ ctxt->rc_maxpages = pages;
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
+ addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(device, addr))
goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
@@ -163,7 +168,7 @@ fail0:
static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
+ ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr,
ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
kfree(ctxt->rc_recv_buf);
kfree(ctxt);
@@ -497,7 +502,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
* a computation, perform a simple range check. This is an
* arbitrary but sensible limit (ie, not architectural).
*/
- if (unlikely(segcount > RPCSVC_MAXPAGES))
+ if (unlikely(segcount > rctxt->rc_maxpages))
return false;
p = xdr_inline_decode(&rctxt->rc_stream,
@@ -857,18 +862,12 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp,
unsigned int i;
/* Transfer the Read chunk pages into @rqstp.rq_pages, replacing
- * the rq_pages that were already allocated for this rqstp.
+ * the receive buffer pages already allocated for this rqstp.
*/
- release_pages(rqstp->rq_respages, ctxt->rc_page_count);
+ release_pages(rqstp->rq_pages, ctxt->rc_page_count);
for (i = 0; i < ctxt->rc_page_count; i++)
rqstp->rq_pages[i] = ctxt->rc_pages[i];
- /* Update @rqstp's result send buffer to start after the
- * last page in the RDMA Read payload.
- */
- rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
/* Prevent svc_rdma_recv_ctxt_put() from releasing the
* pages in ctxt::rc_pages a second time.
*/
@@ -927,10 +926,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svc_rdma_recv_ctxt *ctxt;
int ret;
- /* Prevent svc_xprt_release() from releasing pages in rq_pages
- * when returning 0 or an error.
+ /* Precaution: a zero page count on error return causes
+ * svc_rqst_release_pages() to release nothing.
*/
- rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
rqstp->rq_xprt_ctxt = NULL;
@@ -958,7 +956,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return 0;
percpu_counter_inc(&svcrdma_stat_recv);
- ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
+ ib_dma_sync_single_for_cpu(rdma_xprt->sc_cm_id->device,
ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
DMA_FROM_DEVICE);
svc_rdma_build_arg_xdr(rqstp, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 40797114d50a..402e2ceca4ff 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -5,6 +5,8 @@
* Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
*/
+#include <linux/bvec.h>
+#include <linux/overflow.h>
#include <rdma/rw.h>
#include <linux/sunrpc/xdr.h>
@@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
/* Each R/W context contains state for one chain of RDMA Read or
* Write Work Requests.
*
- * Each WR chain handles a single contiguous server-side buffer,
- * because scatterlist entries after the first have to start on
- * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ * Each WR chain handles a single contiguous server-side buffer.
+ * - each xdr_buf iovec is a single contiguous buffer
+ * - the xdr_buf pages array is a single contiguous buffer because the
+ * second through the last element always start on a page boundary
*
* Each WR chain handles only one R_key. Each RPC-over-RDMA segment
* from a client may contain a unique R_key, so each WR chain moves
* up to one segment at a time.
*
- * The scatterlist makes this data structure over 4KB in size. To
- * make it less likely to fail, and to handle the allocation for
- * smaller I/O requests without disabling bottom-halves, these
- * contexts are created on demand, but cached and reused until the
- * controlling svcxprt_rdma is destroyed.
+ * The inline bvec array is sized to handle most I/O requests without
+ * additional allocation. Larger requests fall back to dynamic allocation.
+ * These contexts are created on demand, but cached and reused until
+ * the controlling svcxprt_rdma is destroyed.
*/
struct svc_rdma_rw_ctxt {
struct llist_node rw_node;
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
unsigned int rw_nents;
- unsigned int rw_first_sgl_nents;
- struct sg_table rw_sg_table;
- struct scatterlist rw_first_sgl[];
+ unsigned int rw_first_bvec_nents;
+ struct bio_vec *rw_bvec;
+ struct bio_vec rw_first_bvec[];
};
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt);
+
static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
@@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list)
}
static struct svc_rdma_rw_ctxt *
-svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec)
{
struct ib_device *dev = rdma->sc_cm_id->device;
- unsigned int first_sgl_nents = dev->attrs.max_send_sge;
+ unsigned int first_bvec_nents = dev->attrs.max_send_sge;
struct svc_rdma_rw_ctxt *ctxt;
struct llist_node *node;
@@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
if (node) {
ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
- ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
+ ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec,
+ first_bvec_nents),
GFP_KERNEL, ibdev_to_node(dev));
if (!ctxt)
goto out_noctx;
INIT_LIST_HEAD(&ctxt->rw_list);
- ctxt->rw_first_sgl_nents = first_sgl_nents;
+ ctxt->rw_first_bvec_nents = first_bvec_nents;
}
- ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
- if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
- ctxt->rw_sg_table.sgl,
- first_sgl_nents))
- goto out_free;
+ if (nr_bvec <= ctxt->rw_first_bvec_nents) {
+ ctxt->rw_bvec = ctxt->rw_first_bvec;
+ } else {
+ ctxt->rw_bvec = kmalloc_array_node(nr_bvec,
+ sizeof(*ctxt->rw_bvec),
+ GFP_KERNEL,
+ ibdev_to_node(dev));
+ if (!ctxt->rw_bvec)
+ goto out_free;
+ }
return ctxt;
out_free:
- kfree(ctxt);
+ /* Return cached contexts to cache; free freshly allocated ones */
+ if (node)
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ else
+ kfree(ctxt);
out_noctx:
- trace_svcrdma_rwctx_empty(rdma, sges);
+ trace_svcrdma_rwctx_empty(rdma, nr_bvec);
return NULL;
}
static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
struct llist_head *list)
{
- sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
+ if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+ kfree(ctxt->rw_bvec);
llist_add(&ctxt->rw_node, list);
}
@@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
* @ctxt: R/W context to prepare
* @offset: RDMA offset
* @handle: RDMA tag/handle
+ * @length: total number of bytes in the bvec array
* @direction: I/O direction
*
* Returns on success, the number of WQEs that will be needed
@@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
*/
static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt,
- u64 offset, u32 handle,
+ u64 offset, u32 handle, unsigned int length,
enum dma_data_direction direction)
{
+ struct bvec_iter iter = {
+ .bi_size = length,
+ };
int ret;
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
- ctxt->rw_sg_table.sgl, ctxt->rw_nents,
- 0, offset, handle, direction);
+ ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num,
+ ctxt->rw_bvec, ctxt->rw_nents,
+ iter, offset, handle, direction);
if (unlikely(ret < 0)) {
trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
ctxt->rw_nents, ret);
@@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
{
struct llist_node *first, *last;
struct svc_rdma_rw_ctxt *ctxt;
- LLIST_HEAD(free);
trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
@@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
- rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, ctxt->rw_sg_table.sgl,
- ctxt->rw_nents, dir);
- __svc_rdma_put_rw_ctxt(ctxt, &free);
+ rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num,
+ ctxt->rw_bvec, ctxt->rw_nents, dir);
+ if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+ kfree(ctxt->rw_bvec);
ctxt->rw_node.next = first;
first = &ctxt->rw_node;
@@ -231,6 +252,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
}
/**
+ * svc_rdma_write_chunk_release - Release Write chunk I/O resources
+ * @rdma: controlling transport
+ * @ctxt: Send context that is being released
+ *
+ * Write chunk resources remain live until Send completion because
+ * Write WRs are chained to the Send WR. This function releases all
+ * write_info structures accumulated on @ctxt->sc_write_info_list.
+ */
+void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
+{
+ struct svc_rdma_write_info *info;
+
+ while (!list_empty(&ctxt->sc_write_info_list)) {
+ info = list_first_entry(&ctxt->sc_write_info_list,
+ struct svc_rdma_write_info, wi_list);
+ list_del(&info->wi_list);
+ svc_rdma_write_info_free(info);
+ }
+}
+
+/**
* svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
* @rdma: controlling transport
* @ctxt: Send context that is being released
@@ -286,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
- struct svc_rdma_write_info *info =
- container_of(cc, struct svc_rdma_write_info, wi_cc);
switch (wc->status) {
case IB_WC_SUCCESS:
trace_svcrdma_wc_write(&cc->cc_cid);
- break;
+ return;
case IB_WC_WR_FLUSH_ERR:
trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
break;
@@ -300,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
}
- svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
-
- if (unlikely(wc->status != IB_WC_SUCCESS))
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- svc_rdma_write_info_free(info);
+ /* The RDMA Write has flushed, so the client won't get
+ * some of the outgoing RPC message. Signal the loss
+ * to the client by closing the connection.
+ */
+ svc_xprt_deferred_close(&rdma->sc_xprt);
}
/**
@@ -384,59 +424,39 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
cqe = NULL;
}
- do {
- if (atomic_sub_return(cc->cc_sqecount,
- &rdma->sc_sq_avail) > 0) {
- cc->cc_posttime = ktime_get();
- ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
- if (ret)
- break;
- return 0;
- }
-
- percpu_counter_inc(&svcrdma_stat_sq_starve);
- trace_svcrdma_sq_full(rdma, &cc->cc_cid);
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wait_event(rdma->sc_send_wait,
- atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
- trace_svcrdma_sq_retry(rdma, &cc->cc_cid);
- } while (1);
-
- trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret);
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- /* If even one was posted, there will be a completion. */
- if (bad_wr != first_wr)
- return 0;
+ ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount);
+ if (ret < 0)
+ return ret;
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wake_up(&rdma->sc_send_wait);
- return -ENOTCONN;
+ cc->cc_posttime = ktime_get();
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr,
+ first_wr, cc->cc_sqecount,
+ ret);
+ return 0;
}
-/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+/* Build a bvec that covers one kvec in an xdr_buf.
*/
-static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
- unsigned int len,
- struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
{
- struct scatterlist *sg = ctxt->rw_sg_table.sgl;
-
- sg_set_buf(&sg[0], info->wi_base, len);
+ bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len);
info->wi_base += len;
ctxt->rw_nents = 1;
}
-/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+/* Build a bvec array that covers part of an xdr_buf's pagelist.
*/
-static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
- unsigned int remaining,
- struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
{
- unsigned int sge_no, sge_bytes, page_off, page_no;
+ unsigned int bvec_idx, bvec_len, page_off, page_no;
const struct xdr_buf *xdr = info->wi_xdr;
- struct scatterlist *sg;
struct page **page;
page_off = info->wi_next_off + xdr->page_base;
@@ -444,21 +464,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
page_off = offset_in_page(page_off);
page = xdr->pages + page_no;
info->wi_next_off += remaining;
- sg = ctxt->rw_sg_table.sgl;
- sge_no = 0;
+ bvec_idx = 0;
do {
- sge_bytes = min_t(unsigned int, remaining,
- PAGE_SIZE - page_off);
- sg_set_page(sg, *page, sge_bytes, page_off);
-
- remaining -= sge_bytes;
- sg = sg_next(sg);
+ bvec_len = min_t(unsigned int, remaining,
+ PAGE_SIZE - page_off);
+ bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len,
+ page_off);
+ remaining -= bvec_len;
page_off = 0;
- sge_no++;
+ bvec_idx++;
page++;
} while (remaining);
- ctxt->rw_nents = sge_no;
+ ctxt->rw_nents = bvec_idx;
}
/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
@@ -496,7 +514,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
constructor(info, write_len, ctxt);
offset = seg->rs_offset + info->wi_seg_off;
ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
- DMA_TO_DEVICE);
+ write_len, DMA_TO_DEVICE);
if (ret < 0)
return -EIO;
percpu_counter_inc(&svcrdma_stat_write);
@@ -535,7 +553,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
const struct kvec *iov)
{
info->wi_base = iov->iov_base;
- return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec,
iov->iov_len);
}
@@ -559,7 +577,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
{
info->wi_xdr = xdr;
info->wi_next_off = offset - xdr->head[0].iov_len;
- return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec,
length);
}
@@ -601,9 +619,37 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
return xdr->len;
}
-static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
- const struct svc_rdma_chunk *chunk,
- const struct xdr_buf *xdr)
+/* Link chunk WRs onto @sctxt's WR chain. Completion is requested
+ * for the tail WR, which is posted first.
+ */
+static void svc_rdma_cc_link_wrs(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ struct svc_rdma_chunk_ctxt *cc)
+{
+ struct ib_send_wr *first_wr;
+ struct list_head *pos;
+ struct ib_cqe *cqe;
+
+ first_wr = sctxt->sc_wr_chain;
+ cqe = &cc->cc_cqe;
+ list_for_each(pos, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *rwc;
+
+ rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+ sctxt->sc_wr_chain = first_wr;
+ sctxt->sc_sqecount += cc->cc_sqecount;
+}
+
+/* Link Write WRs for @chunk onto @sctxt's WR chain.
+ */
+static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc;
@@ -623,10 +669,14 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
if (ret != payload.len)
goto out_err;
- trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
- ret = svc_rdma_post_chunk_ctxt(rdma, cc);
- if (ret < 0)
+ ret = -EINVAL;
+ if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth))
goto out_err;
+
+ svc_rdma_cc_link_wrs(rdma, sctxt, cc);
+ list_add(&info->wi_list, &sctxt->sc_write_info_list);
+
+ trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
return 0;
out_err:
@@ -635,17 +685,19 @@ out_err:
}
/**
- * svc_rdma_send_write_list - Send all chunks on the Write list
+ * svc_rdma_prepare_write_list - Construct WR chain for sending Write list
* @rdma: controlling RDMA transport
* @rctxt: Write list provisioned by the client
+ * @sctxt: Send WR resources
* @xdr: xdr_buf containing an RPC Reply message
*
- * Returns zero on success, or a negative errno if one or more
- * Write chunks could not be sent.
+ * Returns zero on success, or a negative errno if WR chain
+ * construction fails for one or more Write chunks.
*/
-int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
- const struct svc_rdma_recv_ctxt *rctxt,
- const struct xdr_buf *xdr)
+int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_chunk *chunk;
int ret;
@@ -653,7 +705,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
if (!chunk->ch_payload_length)
break;
- ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
+ ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr);
if (ret < 0)
return ret;
}
@@ -683,9 +735,6 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
{
struct svc_rdma_write_info *info = &sctxt->sc_reply_info;
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
- struct ib_send_wr *first_wr;
- struct list_head *pos;
- struct ib_cqe *cqe;
int ret;
info->wi_rdma = rdma;
@@ -699,23 +748,222 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
if (ret < 0)
return ret;
- first_wr = sctxt->sc_wr_chain;
- cqe = &cc->cc_cqe;
- list_for_each(pos, &cc->cc_rwctxts) {
- struct svc_rdma_rw_ctxt *rwc;
-
- rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
- first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, cqe, first_wr);
- cqe = NULL;
- }
- sctxt->sc_wr_chain = first_wr;
- sctxt->sc_sqecount += cc->cc_sqecount;
+ svc_rdma_cc_link_wrs(rdma, sctxt, cc);
trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
return xdr->len;
}
+/*
+ * Cap contiguous RDMA Read sink allocations at order-4.
+ * Higher orders risk allocation failure under
+ * __GFP_NORETRY, which would negate the benefit of the
+ * contiguous fast path.
+ */
+#define SVC_RDMA_CONTIG_MAX_ORDER 4
+
+/**
+ * svc_rdma_alloc_read_pages - Allocate physically contiguous pages
+ * @nr_pages: number of pages needed
+ * @order: on success, set to the allocation order
+ *
+ * Attempts a higher-order allocation, falling back to smaller orders.
+ * The returned pages are split immediately so each sub-page has its
+ * own refcount and can be freed independently.
+ *
+ * Returns a pointer to the first page on success, or NULL if even
+ * order-1 allocation fails.
+ */
+static struct page *
+svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order)
+{
+ unsigned int o;
+ struct page *page;
+
+ o = min(get_order(nr_pages << PAGE_SHIFT),
+ SVC_RDMA_CONTIG_MAX_ORDER);
+
+ while (o >= 1) {
+ page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN,
+ o);
+ if (page) {
+ split_page(page, o);
+ *order = o;
+ return page;
+ }
+ o--;
+ }
+ return NULL;
+}
+
+/*
+ * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation
+ * @rqstp: RPC transaction context
+ * @head: context for ongoing I/O
+ * @bv: bvec entry to fill
+ * @pages_left: number of data pages remaining in the segment
+ * @len_left: bytes remaining in the segment
+ *
+ * On success, fills @bv with a bvec spanning the contiguous range and
+ * advances rc_curpage/rc_page_count. Returns the byte length covered,
+ * or zero if the allocation failed or would overrun rq_maxpages.
+ */
+static unsigned int
+svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *head,
+ struct bio_vec *bv, unsigned int pages_left,
+ unsigned int len_left)
+{
+ unsigned int order, npages, chunk_pages, chunk_len, i;
+ struct page *page;
+
+ page = svc_rdma_alloc_read_pages(pages_left, &order);
+ if (!page)
+ return 0;
+ npages = 1 << order;
+
+ if (head->rc_curpage + npages > rqstp->rq_maxpages) {
+ for (i = 0; i < npages; i++)
+ __free_page(page + i);
+ return 0;
+ }
+
+ /*
+ * Replace rq_pages[] entries with pages from the contiguous
+ * allocation. If npages exceeds chunk_pages, the extra pages
+ * stay in rq_pages[] for later reuse or normal rqst teardown.
+ */
+ for (i = 0; i < npages; i++) {
+ svc_rqst_page_release(rqstp,
+ rqstp->rq_pages[head->rc_curpage + i]);
+ rqstp->rq_pages[head->rc_curpage + i] = page + i;
+ }
+
+ chunk_pages = min(npages, pages_left);
+ chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left);
+ bvec_set_page(bv, page, chunk_len, 0);
+ head->rc_page_count += chunk_pages;
+ head->rc_curpage += chunk_pages;
+ return chunk_len;
+}
+
+/*
+ * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array
+ * @head: context for ongoing I/O
+ * @ctxt: R/W context whose bvec array is being filled
+ * @cur: page to add
+ * @bvec_idx: pointer to current bvec index, not advanced on merge
+ * @len_left: bytes remaining in the segment
+ *
+ * If @cur is physically contiguous with the preceding bvec, it is
+ * merged by extending that bvec's length. Otherwise a new bvec
+ * entry is created. Returns the byte length covered.
+ */
+static unsigned int
+svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head,
+ struct svc_rdma_rw_ctxt *ctxt, struct page *cur,
+ unsigned int *bvec_idx, unsigned int len_left)
+{
+ unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left);
+
+ head->rc_page_count++;
+ head->rc_curpage++;
+
+ if (*bvec_idx > 0) {
+ struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1];
+
+ if (page_to_phys(prev->bv_page) + prev->bv_offset +
+ prev->bv_len == page_to_phys(cur)) {
+ prev->bv_len += chunk_len;
+ return chunk_len;
+ }
+ }
+
+ bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0);
+ (*bvec_idx)++;
+ return chunk_len;
+}
+
+/**
+ * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages
+ * @rqstp: RPC transaction context
+ * @head: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Greedily allocates higher-order pages to cover the segment,
+ * building one bvec per contiguous chunk. Each allocation is
+ * split so sub-pages have independent refcounts. When a
+ * higher-order allocation fails, remaining pages are covered
+ * individually, merging adjacent pages into the preceding bvec
+ * when they are physically contiguous. The split sub-pages
+ * replace entries in rq_pages[] so downstream cleanup is
+ * unchanged.
+ *
+ * Returns:
+ * %0: the Read WR was constructed successfully
+ * %-ENOMEM: allocation failed
+ * %-EIO: a DMA mapping error occurred
+ */
+static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *head,
+ const struct svc_rdma_segment *segment)
+{
+ struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
+ struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
+ unsigned int nr_data_pages, bvec_idx;
+ struct svc_rdma_rw_ctxt *ctxt;
+ unsigned int len_left;
+ int ret;
+
+ nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT;
+ if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages)
+ return -ENOMEM;
+
+ ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages);
+ if (!ctxt)
+ return -ENOMEM;
+
+ bvec_idx = 0;
+ len_left = segment->rs_length;
+ while (len_left) {
+ unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT;
+ unsigned int chunk_len = 0;
+
+ if (pages_left >= 2)
+ chunk_len = svc_rdma_fill_contig_bvec(rqstp, head,
+ &ctxt->rw_bvec[bvec_idx],
+ pages_left, len_left);
+ if (chunk_len) {
+ bvec_idx++;
+ } else {
+ struct page *cur =
+ rqstp->rq_pages[head->rc_curpage];
+ chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur,
+ &bvec_idx,
+ len_left);
+ }
+
+ len_left -= chunk_len;
+ }
+
+ ctxt->rw_nents = bvec_idx;
+
+ head->rc_pageoff = offset_in_page(segment->rs_length);
+ if (head->rc_pageoff)
+ head->rc_curpage--;
+
+ ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
+ segment->rs_handle, segment->rs_length,
+ DMA_FROM_DEVICE);
+ if (ret < 0)
+ return -EIO;
+ percpu_counter_inc(&svcrdma_stat_read);
+
+ list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+ cc->cc_sqecount += ret;
+ return 0;
+}
+
/**
* svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
* @rqstp: RPC transaction context
@@ -734,29 +982,37 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
{
struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
- unsigned int sge_no, seg_len, len;
+ unsigned int bvec_idx, nr_bvec, seg_len, len, total;
struct svc_rdma_rw_ctxt *ctxt;
- struct scatterlist *sg;
int ret;
len = segment->rs_length;
- sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
- ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
+ if (check_add_overflow(head->rc_pageoff, len, &total))
+ return -EINVAL;
+ nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
+
+ if (head->rc_pageoff == 0 && nr_bvec >= 2) {
+ ret = svc_rdma_build_read_segment_contig(rqstp, head,
+ segment);
+ if (ret != -ENOMEM)
+ return ret;
+ }
+
+ ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
if (!ctxt)
return -ENOMEM;
- ctxt->rw_nents = sge_no;
+ ctxt->rw_nents = nr_bvec;
- sg = ctxt->rw_sg_table.sgl;
- for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+ for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) {
seg_len = min_t(unsigned int, len,
PAGE_SIZE - head->rc_pageoff);
if (!head->rc_pageoff)
head->rc_page_count++;
- sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
- seg_len, head->rc_pageoff);
- sg = sg_next(sg);
+ bvec_set_page(&ctxt->rw_bvec[bvec_idx],
+ rqstp->rq_pages[head->rc_curpage],
+ seg_len, head->rc_pageoff);
head->rc_pageoff += seg_len;
if (head->rc_pageoff == PAGE_SIZE) {
@@ -765,12 +1021,13 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
}
len -= seg_len;
- if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages)))
+ if (len && ((head->rc_curpage + 1) > rqstp->rq_maxpages))
goto out_overrun;
}
ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
- segment->rs_handle, DMA_FROM_DEVICE);
+ segment->rs_handle, segment->rs_length,
+ DMA_FROM_DEVICE);
if (ret < 0)
return -EIO;
percpu_counter_inc(&svcrdma_stat_read);
@@ -841,6 +1098,9 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp,
for (page_no = 0; page_no < numpages; page_no++) {
unsigned int page_len;
+ if (head->rc_curpage >= rqstp->rq_maxpages)
+ return -EINVAL;
+
page_len = min_t(unsigned int, remaining,
PAGE_SIZE - head->rc_pageoff);
@@ -848,7 +1108,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp,
head->rc_page_count++;
dst = page_address(rqstp->rq_pages[head->rc_curpage]);
- memcpy(dst + head->rc_curpage, src + offset, page_len);
+ memcpy((unsigned char *)dst + head->rc_pageoff, src + offset, page_len);
head->rc_readbytes += page_len;
head->rc_pageoff += page_len;
@@ -860,7 +1120,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp,
offset += page_len;
}
- return -EINVAL;
+ return 0;
}
/**
@@ -1083,10 +1343,16 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
{
unsigned int i;
+ /*
+ * Move only pages containing RPC data into rc_pages[]. Pages
+ * from a contiguous allocation that were not used for the
+ * payload remain in rq_pages[] for subsequent reuse.
+ */
for (i = 0; i < head->rc_page_count; i++) {
head->rc_pages[i] = rqstp->rq_pages[i];
rqstp->rq_pages[i] = NULL;
}
+ rqstp->rq_pages_nfree = head->rc_page_count;
}
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 96154a2367a1..8b3f0c8c14b2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -116,8 +116,10 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
- int node = ibdev_to_node(rdma->sc_cm_id->device);
+ struct ib_device *device = rdma->sc_cm_id->device;
+ int node = ibdev_to_node(device);
struct svc_rdma_send_ctxt *ctxt;
+ unsigned long pages;
dma_addr_t addr;
void *buffer;
int i;
@@ -126,13 +128,19 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
GFP_KERNEL, node);
if (!ctxt)
goto fail0;
+ pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
+ ctxt->sc_pages = kcalloc_node(pages, sizeof(struct page *),
+ GFP_KERNEL, node);
+ if (!ctxt->sc_pages)
+ goto fail1;
+ ctxt->sc_maxpages = pages;
buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
- goto fail1;
- addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
- rdma->sc_max_req_size, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
goto fail2;
+ addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, addr))
+ goto fail3;
svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
@@ -142,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
ctxt->sc_cqe.done = svc_rdma_wc_send;
+ INIT_LIST_HEAD(&ctxt->sc_write_info_list);
ctxt->sc_xprt_buf = buffer;
xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
rdma->sc_max_req_size);
@@ -151,8 +160,10 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey;
return ctxt;
-fail2:
+fail3:
kfree(buffer);
+fail2:
+ kfree(ctxt->sc_pages);
fail1:
kfree(ctxt);
fail0:
@@ -166,16 +177,16 @@ fail0:
*/
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
+ struct ib_device *device = rdma->sc_cm_id->device;
struct svc_rdma_send_ctxt *ctxt;
struct llist_node *node;
while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) {
ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node);
- ib_dma_unmap_single(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr,
- rdma->sc_max_req_size,
- DMA_TO_DEVICE);
+ ib_dma_unmap_single(device, ctxt->sc_sges[0].addr,
+ rdma->sc_max_req_size, DMA_TO_DEVICE);
kfree(ctxt->sc_xprt_buf);
+ kfree(ctxt->sc_pages);
kfree(ctxt);
}
}
@@ -227,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
+ svc_rdma_write_chunk_release(rdma, ctxt);
svc_rdma_reply_chunk_release(rdma, ctxt);
if (ctxt->sc_page_count)
@@ -285,6 +297,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail)
}
/**
+ * svc_rdma_sq_wait - Wait for SQ slots using fair queuing
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @sqecount: number of SQ entries needed
+ *
+ * A ticket-based system ensures fair ordering when multiple threads
+ * wait for Send Queue capacity. Each waiter takes a ticket and is
+ * served in order, preventing starvation.
+ *
+ * Protocol invariant: every ticket holder must increment
+ * sc_sq_ticket_tail exactly once, whether the reservation
+ * succeeds or the connection closes. Failing to advance the
+ * tail stalls all subsequent waiters.
+ *
+ * The ticket counters are signed 32-bit atomics. After
+ * wrapping through INT_MAX, the equality check
+ * (tail == ticket) remains correct because both counters
+ * advance monotonically and the comparison uses exact
+ * equality rather than relational operators.
+ *
+ * Return values:
+ * %0: SQ slots were reserved successfully
+ * %-ENOTCONN: The connection was lost
+ */
+int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid, int sqecount)
+{
+ int ticket;
+
+ /* Fast path: try to reserve SQ slots without waiting.
+ *
+ * A failed reservation temporarily understates sc_sq_avail
+ * until the compensating atomic_add restores it. A Send
+ * completion arriving in that window sees a lower count
+ * than reality, but the value self-corrects once the add
+ * completes. No ordering guarantee is needed here because
+ * the slow path serializes all contended waiters.
+ */
+ if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0))
+ return 0;
+ atomic_add(sqecount, &rdma->sc_sq_avail);
+
+ /* Slow path: take a ticket and wait in line */
+ ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head);
+
+ percpu_counter_inc(&svcrdma_stat_sq_starve);
+ trace_svcrdma_sq_full(rdma, cid);
+
+ /* Wait until all earlier tickets have been served */
+ wait_event(rdma->sc_sq_ticket_wait,
+ test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+ atomic_read(&rdma->sc_sq_ticket_tail) == ticket);
+ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+ goto out_close;
+
+ /* It's our turn. Wait for enough SQ slots to be available. */
+ while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
+ atomic_add(sqecount, &rdma->sc_sq_avail);
+
+ wait_event(rdma->sc_send_wait,
+ test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+ atomic_read(&rdma->sc_sq_avail) >= sqecount);
+ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+ goto out_close;
+ }
+
+ /* Slots reserved successfully. Let the next waiter proceed. */
+ atomic_inc(&rdma->sc_sq_ticket_tail);
+ wake_up(&rdma->sc_sq_ticket_wait);
+ trace_svcrdma_sq_retry(rdma, cid);
+ return 0;
+
+out_close:
+ atomic_inc(&rdma->sc_sq_ticket_tail);
+ wake_up(&rdma->sc_sq_ticket_wait);
+ return -ENOTCONN;
+}
+
+/**
+ * svc_rdma_post_send_err - Handle ib_post_send failure
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @bad_wr: first WR that was not posted
+ * @first_wr: first WR in the chain
+ * @sqecount: number of SQ entries that were reserved
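+ * @ret: error code from ib_post_send
+ *
+ * Traces the failure and schedules a deferred close of the
+ * transport regardless of how many WRs were posted.
+ *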
+ * Return values:
+ * %0: At least one WR was posted; a completion handles cleanup
+ * %-ENOTCONN: No WRs were posted; SQ slots are released
+ */
+int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
+ const struct rpc_rdma_cid *cid,
+ const struct ib_send_wr *bad_wr,
+ const struct ib_send_wr *first_wr,
+ int sqecount, int ret)
+{
+ trace_svcrdma_sq_post_err(rdma, cid, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+
+ /* If even one WR was posted, a Send completion will
+ * return the reserved SQ slots.
+ */
+ if (bad_wr != first_wr)
+ return 0;
+
+ svc_rdma_wake_send_waiters(rdma, sqecount);
+ return -ENOTCONN;
+}
+
+/**
* svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: Completion Queue context
* @wc: Work Completion object
@@ -326,11 +449,6 @@ flushed:
* that these values remain available after the ib_post_send() call.
* In some error flow cases, svc_rdma_wc_send() releases @ctxt.
*
- * Note there is potential for starvation when the Send Queue is
- * full because there is no order to when waiting threads are
- * awoken. The transport is typically provisioned with a deep
- * enough Send Queue that SQ exhaustion should be a rare event.
- *
* Return values:
* %0: @ctxt's WR chain was posted successfully
* %-ENOTCONN: The connection was lost
@@ -347,47 +465,21 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma,
might_sleep();
/* Sync the transport header buffer */
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ ib_dma_sync_single_for_device(rdma->sc_cm_id->device,
send_wr->sg_list[0].addr,
send_wr->sg_list[0].length,
DMA_TO_DEVICE);
- /* If the SQ is full, wait until an SQ entry is available */
- while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
- if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
- svc_rdma_wake_send_waiters(rdma, sqecount);
-
- /* When the transport is torn down, assume
- * ib_drain_sq() will trigger enough Send
- * completions to wake us. The XPT_CLOSE test
- * above should then cause the while loop to
- * exit.
- */
- percpu_counter_inc(&svcrdma_stat_sq_starve);
- trace_svcrdma_sq_full(rdma, &cid);
- wait_event(rdma->sc_send_wait,
- atomic_read(&rdma->sc_sq_avail) > 0);
- trace_svcrdma_sq_retry(rdma, &cid);
- continue;
- }
-
- trace_svcrdma_post_send(ctxt);
- ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
- if (ret) {
- trace_svcrdma_sq_post_err(rdma, &cid, ret);
- svc_xprt_deferred_close(&rdma->sc_xprt);
-
- /* If even one WR was posted, there will be a
- * Send completion that bumps sc_sq_avail.
- */
- if (bad_wr == first_wr) {
- svc_rdma_wake_send_waiters(rdma, sqecount);
- break;
- }
- }
- return 0;
- }
- return -ENOTCONN;
+ ret = svc_rdma_sq_wait(rdma, &cid, sqecount);
+ if (ret < 0)
+ return ret;
+
+ trace_svcrdma_post_send(ctxt);
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ return svc_rdma_post_send_err(rdma, &cid, bad_wr,
+ first_wr, sqecount, ret);
+ return 0;
}
/**
@@ -848,7 +940,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
/* The svc_rqst and all resources it owns are released as soon as
* svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
- * so they are released by the Send completion handler.
+ * so they are released only after Send completion, and not by
+ * svc_rqst_release_pages().
*/
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
struct svc_rdma_send_ctxt *ctxt)
@@ -860,9 +953,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
ctxt->sc_pages[i] = rqstp->rq_respages[i];
rqstp->rq_respages[i] = NULL;
}
-
- /* Prevent svc_xprt_release from releasing pages in rq_pages */
- rqstp->rq_next_page = rqstp->rq_respages;
}
/* Prepare the portion of the RPC Reply that will be transmitted
@@ -966,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
sctxt->sc_send_wr.num_sge = 1;
sctxt->sc_send_wr.opcode = IB_WR_SEND;
sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+
+ /* Ensure only the error message is posted, not any previously
+ * prepared Write chunk WRs.
+ */
+ sctxt->sc_wr_chain = &sctxt->sc_send_wr;
+ sctxt->sc_sqecount = 1;
if (svc_rdma_post_send(rdma, sctxt))
goto put_ctxt;
return;
@@ -1013,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (!p)
goto put_ctxt;
- ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res);
+ ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res);
if (ret < 0)
goto put_ctxt;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c3fbf0779d4a..f18bc60d9f4f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
init_llist_head(&cma_xprt->sc_recv_ctxts);
init_llist_head(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
+ init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
@@ -406,15 +407,14 @@ static void svc_rdma_xprt_done(struct rpcrdma_notification *rn)
*/
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
{
+ unsigned int ctxts, rq_depth, maxpayload;
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
- unsigned int ctxts, rq_depth;
struct ib_device *dev;
int ret = 0;
- RPC_IFDEBUG(struct sockaddr *sap);
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -462,16 +462,24 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_bc_requests = 2;
}
- /* Arbitrarily estimate the number of rw_ctxs needed for
- * this transport. This is enough rw_ctxs to make forward
- * progress even if the client is using one rkey per page
- * in each Read chunk.
+ /* Estimate the needed number of rdma_rw contexts. The maximum
+ * Read and Write chunks have one segment each. Each request
+ * can involve one Read chunk and either a Write chunk or Reply
+ * chunk; thus a factor of three.
*/
- ctxts = 3 * RPCSVC_MAXPAGES;
- newxprt->sc_sq_depth = rq_depth + ctxts;
+ maxpayload = min(xprt->xpt_server->sv_max_payload,
+ RPCSVC_MAXPAYLOAD_RDMA);
+ ctxts = newxprt->sc_max_requests * 3 *
+ rdma_rw_mr_factor(dev, newxprt->sc_port_num,
+ maxpayload >> PAGE_SHIFT);
+
+ newxprt->sc_sq_depth = rq_depth +
+ rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0);
if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
+ atomic_set(&newxprt->sc_sq_ticket_head, 0);
+ atomic_set(&newxprt->sc_sq_ticket_tail, 0);
newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
@@ -554,18 +562,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- dprintk("svcrdma: new connection accepted on device %s:\n", dev->name);
- sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
- dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
- sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
- dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
- dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges);
- dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
- dprintk(" rdma_rw_ctxs : %d\n", ctxts);
- dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
- dprintk(" ord : %d\n", conn_param.initiator_depth);
-#endif
+ if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) {
+ struct sockaddr *sap;
+
+ dprintk("svcrdma: new connection accepted on device %s:\n", dev->name);
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
+ dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges);
+ dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" rdma_rw_ctxs : %d\n", ctxts);
+ dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
+ dprintk(" ord : %d\n", conn_param.initiator_depth);
+ }
return &newxprt->sc_xprt;
@@ -575,6 +585,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
ib_destroy_qp(newxprt->sc_qp);
rdma_destroy_id(newxprt->sc_cm_id);
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
/* This call to put will destroy the transport */
svc_xprt_put(&newxprt->sc_xprt);
return NULL;
@@ -588,12 +599,18 @@ static void svc_rdma_detach(struct svc_xprt *xprt)
rdma_disconnect(rdma->sc_cm_id);
}
-static void __svc_rdma_free(struct work_struct *work)
+/**
+ * svc_rdma_free - Release class-specific transport resources
+ * @xprt: Generic svc transport object
+ */
+static void svc_rdma_free(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
- container_of(work, struct svcxprt_rdma, sc_work);
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct ib_device *device = rdma->sc_cm_id->device;
+ might_sleep();
+
/* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
@@ -621,19 +638,11 @@ static void __svc_rdma_free(struct work_struct *work)
/* Destroy the CM ID */
rdma_destroy_id(rdma->sc_cm_id);
- rpcrdma_rn_unregister(device, &rdma->sc_rn);
+ if (!test_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags))
+ rpcrdma_rn_unregister(device, &rdma->sc_rn);
kfree(rdma);
}
-static void svc_rdma_free(struct svc_xprt *xprt)
-{
- struct svcxprt_rdma *rdma =
- container_of(xprt, struct svcxprt_rdma, sc_xprt);
-
- INIT_WORK(&rdma->sc_work, __svc_rdma_free);
- schedule_work(&rdma->sc_work);
-}
-
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
@@ -643,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
* If there are already waiters on the SQ,
* return false.
*/
- if (waitqueue_active(&rdma->sc_send_wait))
+ if (waitqueue_active(&rdma->sc_send_wait) ||
+ waitqueue_active(&rdma->sc_sq_ticket_wait))
return 0;
/* Otherwise return true. */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9a8ce5df83ca..61706df5e485 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -510,8 +510,21 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
return;
out_sleep:
- task->tk_status = -ENOMEM;
- xprt_add_backlog(xprt, task);
+ task->tk_status = -EAGAIN;
+ xprt_add_backlog_noncongested(xprt, task);
+ /* A buffer freed between buffer_get and rpc_sleep_on
+ * goes back to the pool with no waiter to wake.
+ * Re-check after joining the backlog to close that gap.
+ */
+ req = rpcrdma_buffer_get(&r_xprt->rx_buf);
+ if (req) {
+ struct rpc_rqst *rqst = &req->rl_slot;
+
+ if (!xprt_wake_up_backlog(xprt, rqst)) {
+ memset(rqst, 0, sizeof(*rqst));
+ rpcrdma_buffer_put(&r_xprt->rx_buf, req);
+ }
+ }
}
/**
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 63262ef0c2e3..aecf9c0a153f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -383,7 +383,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ep *ep;
int rc;
- ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS);
+ ep = kzalloc_obj(*ep, XPRTRDMA_GFP_FLAGS);
if (!ep)
return -ENOTCONN;
ep->re_xprt = &r_xprt->rx_xprt;
@@ -615,8 +615,8 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
- XPRTRDMA_GFP_FLAGS);
+ sc = kzalloc_flex(*sc, sc_sges, ep->re_attr.cap.max_send_sge,
+ XPRTRDMA_GFP_FLAGS);
if (!sc)
return NULL;
@@ -639,7 +639,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* Sends are posted.
*/
i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
- buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS);
+ buf->rb_sc_ctxs = kzalloc_objs(sc, i, XPRTRDMA_GFP_FLAGS);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
@@ -708,6 +708,18 @@ out_emptyq:
*/
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
r_xprt->rx_stats.empty_sendctx_q++;
+
+ /* Recheck: a Send completion between the ring-empty test
+ * and the set_bit could cause its xprt_write_space() to
+ * miss, leaving XPRT_WRITE_SPACE set with a non-full ring.
+ * The smp_mb__after_atomic() pairs with smp_store_release()
+ * in rpcrdma_sendctx_put_locked().
+ */
+ smp_mb__after_atomic();
+ next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head);
+ if (next_head != READ_ONCE(buf->rb_sc_tail))
+ xprt_write_space(&r_xprt->rx_xprt);
+
return NULL;
}
@@ -739,7 +751,10 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
} while (buf->rb_sc_ctxs[next_tail] != sc);
- /* Paired with READ_ONCE */
+ /* Paired with READ_ONCE in rpcrdma_sendctx_get_locked():
+ * both the fast-path ring-full test and the post-set_bit
+ * recheck in the slow path depend on this store-release.
+ */
smp_store_release(&buf->rb_sc_tail, next_tail);
xprt_write_space(&r_xprt->rx_xprt);
@@ -822,7 +837,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;
- req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS);
+ req = kzalloc_obj(*req, XPRTRDMA_GFP_FLAGS);
if (req == NULL)
goto out1;
@@ -952,7 +967,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt)
struct ib_device *device = ep->re_id->device;
struct rpcrdma_rep *rep;
- rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
+ rep = kzalloc_obj(*rep, XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;
@@ -1359,10 +1374,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
if (likely(ep->re_receive_count > needed))
goto out;
needed -= ep->re_receive_count;
- needed += RPCRDMA_MAX_RECV_BATCH;
+ needed += ep->re_recv_batch;
if (atomic_inc_return(&ep->re_receiving) > 1)
- goto out;
+ goto out_dec;
/* fast path: all needed reps can be found on the free list */
wr = NULL;
@@ -1385,7 +1400,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
++count;
}
if (!wr)
- goto out;
+ goto out_dec;
rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
@@ -1400,9 +1415,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
--count;
}
}
+
+out_dec:
if (atomic_dec_return(&ep->re_receiving) > 0)
complete(&ep->re_done);
-
out:
trace_xprtrdma_post_recvs(r_xprt, count);
ep->re_receive_count += count;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8147d2b41494..f53a77472724 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -96,6 +96,7 @@ struct rpcrdma_ep {
struct rpcrdma_notification re_rn;
int re_receive_count;
unsigned int re_max_requests; /* depends on device */
+ unsigned int re_recv_batch;
unsigned int re_inline_send; /* negotiated */
unsigned int re_inline_recv; /* negotiated */
@@ -283,19 +284,36 @@ struct rpcrdma_mr {
* registered or invalidated. Must handle a Reply chunk:
*/
enum {
- RPCRDMA_MAX_IOV_SEGS = 3,
+ RPCRDMA_MAX_IOV_SEGS = 3, /* head, page-boundary, tail */
RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
RPCRDMA_MAX_IOV_SEGS,
};
-/* Arguments for DMA mapping and registration */
-struct rpcrdma_mr_seg {
- u32 mr_len; /* length of segment */
- struct page *mr_page; /* underlying struct page */
- u64 mr_offset; /* IN: page offset, OUT: iova */
+/**
+ * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf
+ * for iterative MR registration
+ * @xc_buf: the xdr_buf being iterated
+ * @xc_page_offset: byte offset into the page region consumed so far
+ * @xc_flags: combination of XC_* bits
+ *
+ * Each XC_*_DONE flag indicates that the corresponding region has
+ * no remaining MR registration work. That condition holds both when
+ * the region has already been registered by a prior frwr_map() call
+ * and when the region is excluded from this chunk type (pre-set at
+ * init time by rpcrdma_xdr_cursor_init()). frwr_map() treats the
+ * two cases identically: skip the region.
+ */
+struct rpcrdma_xdr_cursor {
+ const struct xdr_buf *xc_buf;
+ unsigned int xc_page_offset;
+ unsigned int xc_flags;
};
+#define XC_HEAD_DONE BIT(0)
+#define XC_PAGES_DONE BIT(1)
+#define XC_TAIL_DONE BIT(2)
+
/* The Send SGE array is provisioned to send a maximum size
* inline request:
* - RPC-over-RDMA header
@@ -330,7 +348,6 @@ struct rpcrdma_req {
struct list_head rl_free_mrs;
struct list_head rl_registered;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
@@ -450,8 +467,8 @@ rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt)
}
/* Setting this to 0 ensures interoperability with early servers.
- * Setting this to 1 enhances certain unaligned read/write performance.
- * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+ * Setting this to 1 enhances unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c */
extern int xprt_rdma_pad_optimize;
/* This setting controls the hunt for a supported memory
@@ -535,10 +552,10 @@ void frwr_reset(struct rpcrdma_req *req);
int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
void frwr_mr_release(struct rpcrdma_mr *mr);
-struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing, __be32 xid,
- struct rpcrdma_mr *mr);
+int frwr_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_xdr_cursor *cur,
+ bool writing, __be32 xid,
+ struct rpcrdma_mr *mr);
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 83cc095846d3..2e1fe6013361 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
static int
xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
- struct cmsghdr *cmsg, int ret)
+ unsigned int *msg_flags, struct cmsghdr *cmsg, int ret)
{
u8 content_type = tls_get_record_type(sock->sk, cmsg);
u8 level, description;
@@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
* record, even though there might be more frames
* waiting to be decrypted.
*/
- msg->msg_flags &= ~MSG_EOR;
+ *msg_flags &= ~MSG_EOR;
break;
case TLS_RECORD_TYPE_ALERT:
tls_alert_recv(sock->sk, msg, &level, &description);
@@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
static int
-xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags)
+xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
int ret;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
- ret = sock_recvmsg(sock, msg, flags);
- if (msg->msg_controllen != sizeof(u))
- ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, flags);
+ if (ret > 0) {
+ if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT)
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
+ -EAGAIN);
+ }
return ret;
}
@@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
ssize_t ret;
if (seek != 0)
iov_iter_advance(&msg->msg_iter, seek);
- ret = xs_sock_recv_cmsg(sock, msg, flags);
+ ret = sock_recvmsg(sock, msg, flags);
+ /* Handle TLS inband control message lazily */
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags);
+ }
return ret > 0 ? ret + seek : ret;
}
@@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
size_t count)
{
iov_iter_discard(&msg->msg_iter, ITER_DEST, count);
- return xs_sock_recv_cmsg(sock, msg, flags);
+ return xs_sock_recvmsg(sock, msg, flags, 0);
}
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
@@ -1825,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
do {
rpc_set_port((struct sockaddr *)&myaddr, port);
- err = kernel_bind(sock, (struct sockaddr *)&myaddr,
- transport->xprt.addrlen);
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr,
+ transport->xprt.addrlen);
if (err == 0) {
if (transport->xprt.reuseport)
transport->srcport = port;
@@ -1985,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
xs_stream_start_connect(transport);
- return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
+ return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0);
}
/**
@@ -2385,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
/* Tell the socket layer to start connecting... */
set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
- return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+ return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt),
+ xprt->addrlen, O_NONBLOCK);
}
/**
@@ -2726,20 +2747,14 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work)
if (status)
goto out_close;
xprt_release_write(lower_xprt, NULL);
-
trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0);
- if (!xprt_test_and_set_connected(upper_xprt)) {
- upper_xprt->connect_cookie++;
- clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state);
- xprt_clear_connecting(upper_xprt);
-
- upper_xprt->stat.connect_count++;
- upper_xprt->stat.connect_time += (long)jiffies -
- upper_xprt->stat.connect_start;
- xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING);
- }
rpc_shutdown_client(lower_clnt);
+ /* Check for ingress data that arrived before the socket's
+ * ->data_ready callback was set up.
+ */
+ xs_poll_check_readable(upper_transport);
+
out_unlock:
current_restore_flags(pflags, PF_MEMALLOC);
upper_transport->clnt = NULL;