diff options
Diffstat (limited to 'net/sunrpc')
41 files changed, 2278 insertions, 1834 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. + config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 04534ea537c8..68c0595ea2fd 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -290,12 +290,12 @@ rpcauth_init_credcache(struct rpc_auth *auth) struct rpc_cred_cache *new; unsigned int hashsize; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; - new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); + new->hashtable = kzalloc_objs(new->hashtable[0], hashsize); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); @@ -489,7 +489,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return number_cred_unused * sysctl_vfs_cache_pressure / 100; + return number_cred_unused; } static void diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 369310909fc9..9d3fb6848f40 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -39,6 +39,8 @@ static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; +static void gss_free_callback(struct kref *kref); + #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; @@ -162,7 +164,7 @@ gss_alloc_context(void) { struct gss_cl_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ @@ -527,7 +529,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, int vers; int err = -ENOMEM; - gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); + gss_msg = kzalloc_obj(*gss_msg); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); @@ -551,6 +553,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, } return gss_msg; err_put_pipe_version: + kref_put(&gss_auth->kref, gss_free_callback); put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); @@ -887,25 +890,16 @@ static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; - struct rpc_pipe *pipe = gss_pipe->pipe; - if (pipe->dentry != NULL) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(gss_pipe->pipe); } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - p->pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { @@ -920,7 +914,7 @@ static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, struct gss_pipe *p; int err = -ENOMEM; - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc_obj(*p); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); @@ -1035,7 +1029,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); - if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) + if (!(gss_auth = kmalloc_obj(*gss_auth))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; @@ -1252,7 +1246,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ - new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); + new = kzalloc_obj(*gss_cred); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, @@ -1386,7 +1380,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t struct gss_cred *cred = NULL; int err = -ENOMEM; - if (!(cred = kzalloc(sizeof(*cred), gfp))) + if (!(cred = kzalloc_obj(*cred, gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); @@ -1545,6 +1539,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) struct kvec iov; struct xdr_buf verf_buf; int status; + u32 seqno; /* Credential */ @@ -1556,15 +1551,16 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); - if (req->rq_seqno == MAXSEQ) + if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); - *p++ = cpu_to_be32(req->rq_seqno); + *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); @@ -1678,17 +1674,31 @@ gss_refresh_null(struct rpc_task *task) return 0; } +static u32 +gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) +{ + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + + *seq = cpu_to_be32(seqno); + iov.iov_base = seq; + iov.iov_len = 4; + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); +} + static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; - struct kvec iov; - struct xdr_buf verf_buf; - struct xdr_netobj mic; u32 len, maj_stat; int status; + int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) @@ -1705,13 +1715,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; - *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); - iov.iov_base = seq; - iov.iov_len = 4; - xdr_buf_from_iov(&iov, &verf_buf); - mic.data = (u8 *)p; - mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); + /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ + while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) @@ -1750,7 +1757,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; integ_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1810,9 +1817,7 @@ alloc_enc_pages(struct rpc_rqst *rqstp) last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages - = kmalloc_array(rqstp->rq_enc_pages_num, - sizeof(struct page *), - GFP_KERNEL); + = kmalloc_objs(struct page *, rqstp->rq_enc_pages_num); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { @@ -1847,7 +1852,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; opaque_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -2001,7 +2006,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (seqno != rqstp->rq_seqno) + if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; @@ -2045,7 +2050,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); @@ -2077,7 +2082,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ - if (be32_to_cpup(p++) != rqstp->rq_seqno) + if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. @@ -2093,7 +2098,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); @@ -2118,14 +2123,14 @@ gss_xmit_need_reencode(struct rpc_task *task) if (!ctx) goto out; - if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); - while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; - seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; @@ -2134,7 +2139,7 @@ gss_xmit_need_reencode(struct rpc_task *task) win = ctx->gc_win; if (win > 0) - ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); + ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 9a27201638e2..16dcf115de1e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -138,60 +138,6 @@ out: return ret; } -/** - * krb5_decrypt - simple decryption of an RPCSEC GSS payload - * @tfm: initialized cipher transform - * @iv: pointer to an IV - * @in: ciphertext to decrypt - * @out: OUT: plaintext - * @length: length of input and output buffers, in bytes - * - * @iv may be NULL to force the use of an all-zero IV. - * The buffer containing the IV must be as large as the - * cipher's ivsize. - * - * Return values: - * %0: @in successfully decrypted into @out - * negative errno: @in not decrypted - */ -u32 -krb5_decrypt( - struct crypto_sync_skcipher *tfm, - void * iv, - void * in, - void * out, - int length) -{ - u32 ret = -EINVAL; - struct scatterlist sg[1]; - u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - if (length % crypto_sync_skcipher_blocksize(tfm) != 0) - goto out; - - if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { - dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_sync_skcipher_ivsize(tfm)); - goto out; - } - if (iv) - memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); - - memcpy(out, in, length); - sg_init_one(sg, out, length); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, length, local_iv); - - ret = crypto_skcipher_decrypt(req); - skcipher_request_zero(req); -out: - dprintk("RPC: gss_k5decrypt returns %d\n",ret); - return ret; -} - static int checksummer(struct scatterlist *sg, void *data) { @@ -202,96 +148,6 @@ checksummer(struct scatterlist *sg, void *data) return crypto_ahash_update(req); } -/* - * checksum the plaintext data and hdrlen bytes of the token header - * The checksum is performed over the first 8 bytes of the - * gss token header and then over the data body - */ -u32 -make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout) -{ - struct crypto_ahash *tfm; - struct ahash_request *req; - struct scatterlist sg[1]; - int err = -1; - u8 *checksumdata; - unsigned int checksumlen; - - if (cksumout->len < kctx->gk5e->cksumlength) { - dprintk("%s: checksum buffer length, %u, too small for %s\n", - __func__, cksumout->len, kctx->gk5e->name); - return GSS_S_FAILURE; - } - - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); - if (checksumdata == NULL) - return GSS_S_FAILURE; - - tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto out_free_cksum; - - req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) - goto out_free_ahash; - - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - checksumlen = crypto_ahash_digestsize(tfm); - - if (cksumkey != NULL) { - err = crypto_ahash_setkey(tfm, cksumkey, - kctx->gk5e->keylength); - if (err) - goto out; - } - - err = crypto_ahash_init(req); - if (err) - goto out; - sg_init_one(sg, header, hdrlen); - ahash_request_set_crypt(req, sg, NULL, hdrlen); - err = crypto_ahash_update(req); - if (err) - goto out; - err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, req); - if (err) - goto out; - ahash_request_set_crypt(req, NULL, checksumdata, 0); - err = crypto_ahash_final(req); - if (err) - goto out; - - switch (kctx->gk5e->ctype) { - case CKSUMTYPE_RSA_MD5: - err = krb5_encrypt(kctx->seq, NULL, checksumdata, - checksumdata, checksumlen); - if (err) - goto out; - memcpy(cksumout->data, - checksumdata + checksumlen - kctx->gk5e->cksumlength, - kctx->gk5e->cksumlength); - break; - case CKSUMTYPE_HMAC_SHA1_DES3: - memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); - break; - default: - BUG(); - break; - } - cksumout->len = kctx->gk5e->cksumlength; -out: - ahash_request_free(req); -out_free_ahash: - crypto_free_ahash(tfm); -out_free_cksum: - kfree(checksumdata); - return err ? GSS_S_FAILURE : 0; -} - /** * gss_krb5_checksum - Compute the MAC for a GSS Wrap or MIC token * @tfm: an initialized hash transform @@ -1019,8 +875,8 @@ out_err: * krb5_etm_decrypt - Decrypt using the RFC 8009 rules * @kctx: Kerberos context * @offset: starting offset of the ciphertext, in bytes - * @len: - * @buf: + * @len: size of ciphertext to unwrap + * @buf: ciphertext to unwrap * @headskip: OUT: the enctype's confounder length, in octets * @tailskip: OUT: the enctype's HMAC length, in octets * diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h index a47e9ec228a5..8769e9e705bf 100644 --- a/net/sunrpc/auth_gss/gss_krb5_internal.h +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -155,10 +155,6 @@ static inline int krb5_derive_key(struct krb5_ctx *kctx, void krb5_make_confounder(u8 *p, int conflen); -u32 make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout); - u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, const struct xdr_buf *body, int body_offset, struct xdr_netobj *cksumout); @@ -166,9 +162,6 @@ u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, u32 krb5_encrypt(struct crypto_sync_skcipher *key, void *iv, void *in, void *out, int length); -u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in, - void *out, int length); - int xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 3366505bc669..6db64a9111a9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -473,7 +473,7 @@ gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id, struct krb5_ctx *ctx; int ret; - ctx = kzalloc(sizeof(*ctx), gfp_mask); + ctx = kzalloc_obj(*ctx, gfp_mask); if (ctx == NULL) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index a5bff02cd7ba..dde1ee934d0d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -63,10 +63,11 @@ static void kdf_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - derivedkey.data, derivedkey.len), 0, - "key mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + derivedkey.data, + derivedkey.len, + "key mismatch"); } static void checksum_case(struct kunit *test) @@ -111,10 +112,11 @@ static void checksum_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - checksum.data, checksum.len), 0, - "checksum mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + checksum.data, + checksum.len, + "checksum mismatch"); crypto_free_ahash(tfm); } @@ -314,10 +316,11 @@ static void rfc3961_nfold_case(struct kunit *test) param->expected_result->len * 8, result); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - result, param->expected_result->len), 0, - "result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + result, + param->expected_result->len, + "result mismatch"); } static struct kunit_case rfc3961_test_cases[] = { @@ -569,14 +572,16 @@ static void rfc3962_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - text, param->expected_result->len), 0, - "ciphertext mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->next_iv->data, iv, - param->next_iv->len), 0, - "IV mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + text, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->next_iv->data, + iv, + param->next_iv->len, + "IV mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); @@ -1194,15 +1199,17 @@ static void rfc6803_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len + checksum.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, buf.len), 0, - "encrypted result mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data + - (param->expected_result->len - checksum.len), - checksum.data, checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + buf.len, + "encrypted result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data + + (param->expected_result->len - checksum.len), + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); crypto_free_sync_skcipher(cts_tfm); @@ -1687,15 +1694,16 @@ static void rfc8009_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, - param->expected_result->len), 0, - "ciphertext mismatch"); - KUNIT_EXPECT_EQ_MSG(test, memcmp(param->expected_hmac->data, - checksum.data, - checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_hmac->data, + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); crypto_free_sync_skcipher(cts_tfm); @@ -1826,10 +1834,11 @@ static void encrypt_selftest_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->plaintext->len, buf.len, "length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->plaintext->data, - buf.head[0].iov_base, buf.len), 0, - "plaintext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->plaintext->data, + buf.head[0].iov_base, + buf.len, + "plaintext mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index c84d0cf61980..78eab245f94a 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -355,7 +355,7 @@ gss_import_sec_context(const void *input_token, size_t bufsize, time64_t *endtime, gfp_t gfp_mask) { - if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + if (!(*ctx_id = kzalloc_obj(**ctx_id, gfp_mask))) return -ENOMEM; (*ctx_id)->mech_type = gss_mech_get(mech); diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index f549e4c05def..0fa4778620d9 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -214,7 +214,7 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); - arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); + arg->pages = kzalloc_objs(struct page *, arg->npages); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..fceee648d545 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -244,11 +244,11 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* we recognize only 1 currently: CREDS_VALUE */ oa->count = 1; - oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); + oa->data = kmalloc_obj(struct gssx_option); if (!oa->data) return -ENOMEM; - creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); + creds = kzalloc_obj(struct svc_cred); if (!creds) { err = -ENOMEM; goto free_oa; @@ -320,29 +320,47 @@ static int gssx_dec_status(struct xdr_stream *xdr, /* status->minor_status */ p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto out_free_mech; + } p = xdr_decode_hyper(p, &status->minor_status); /* status->major_status_string */ err = gssx_dec_buffer(xdr, &status->major_status_string); if (err) - return err; + goto out_free_mech; /* status->minor_status_string */ err = gssx_dec_buffer(xdr, &status->minor_status_string); if (err) - return err; + goto out_free_major_status_string; /* status->server_ctx */ err = gssx_dec_buffer(xdr, &status->server_ctx); if (err) - return err; + goto out_free_minor_status_string; /* we assume we have no options for now, so simply consume them */ /* status->options */ err = dummy_dec_opt_array(xdr, &status->options); + if (err) + goto out_free_server_ctx; + return 0; + +out_free_server_ctx: + kfree(status->server_ctx.data); + status->server_ctx.data = NULL; +out_free_minor_status_string: + kfree(status->minor_status_string.data); + status->minor_status_string.data = NULL; +out_free_major_status_string: + kfree(status->major_status_string.data); + status->major_status_string.data = NULL; +out_free_mech: + kfree(status->mech.data); + status->mech.data = NULL; return err; } @@ -505,28 +523,35 @@ static int gssx_dec_name(struct xdr_stream *xdr, /* name->name_type */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* name->exported_name */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* name->exported_composite_name */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* we assume we have no attributes for now, so simply consume them */ /* name->name_attributes */ err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array); if (err) - return err; + goto out_free_display_name; /* we assume we have no options for now, so simply consume them */ /* name->extensions */ err = dummy_dec_opt_array(xdr, &dummy_option_array); + if (err) + goto out_free_display_name; + return 0; + +out_free_display_name: + kfree(name->display_name.data); + name->display_name.data = NULL; return err; } @@ -649,32 +674,34 @@ static int gssx_dec_ctx(struct xdr_stream *xdr, /* ctx->state */ err = gssx_dec_buffer(xdr, &ctx->state); if (err) - return err; + goto out_free_exported_context_token; /* ctx->need_release */ err = gssx_dec_bool(xdr, &ctx->need_release); if (err) - return err; + goto out_free_state; /* ctx->mech */ err = gssx_dec_buffer(xdr, &ctx->mech); if (err) - return err; + goto out_free_state; /* ctx->src_name */ err = gssx_dec_name(xdr, &ctx->src_name); if (err) - return err; + goto out_free_mech; /* ctx->targ_name */ err = gssx_dec_name(xdr, &ctx->targ_name); if (err) - return err; + goto out_free_src_name; /* ctx->lifetime */ p = xdr_inline_decode(xdr, 8+8); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto out_free_targ_name; + } p = xdr_decode_hyper(p, &ctx->lifetime); /* ctx->ctx_flags */ @@ -683,17 +710,36 @@ static int gssx_dec_ctx(struct xdr_stream *xdr, /* ctx->locally_initiated */ err = gssx_dec_bool(xdr, &ctx->locally_initiated); if (err) - return err; + goto out_free_targ_name; /* ctx->open */ err = gssx_dec_bool(xdr, &ctx->open); if (err) - return err; + goto out_free_targ_name; /* we assume we have no options for now, so simply consume them */ /* ctx->options */ err = dummy_dec_opt_array(xdr, &ctx->options); + if (err) + goto out_free_targ_name; + + return 0; +out_free_targ_name: + kfree(ctx->targ_name.display_name.data); + ctx->targ_name.display_name.data = NULL; +out_free_src_name: + kfree(ctx->src_name.display_name.data); + ctx->src_name.display_name.data = NULL; +out_free_mech: + kfree(ctx->mech.data); + ctx->mech.data = NULL; +out_free_state: + kfree(ctx->state.data); + ctx->state.data = NULL; +out_free_exported_context_token: + kfree(ctx->exported_context_token.data); + ctx->exported_context_token.data = NULL; return err; } @@ -794,12 +840,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +890,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73a90ad873fb..161d02cc1c2c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -197,7 +197,7 @@ static void update_rsi(struct cache_head *cnew, struct cache_head *citem) static struct cache_head *rsi_alloc(void) { - struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); + struct rsi *rsii = kmalloc_obj(*rsii); if (rsii) return &rsii->h; else @@ -449,7 +449,7 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp) static struct cache_head * rsc_alloc(void) { - struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); + struct rsc *rsci = kmalloc_obj(*rsci); if (rsci) return &rsci->h; else @@ -724,7 +724,7 @@ svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } - if (flavor != RPC_AUTH_GSS) { + if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -814,7 +814,7 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) struct auth_domain *test; int stat = -ENOMEM; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (!new) goto out; kref_init(&new->h.ref); @@ -1069,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); - in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + in_token->pages = kzalloc_objs(struct page *, pages + 1); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; @@ -1083,7 +1083,8 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); - memcpy(page_address(in_token->pages[0]), xdr->p, length); + if (length) + memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; @@ -1628,9 +1629,9 @@ svcauth_gss_accept(struct svc_rqst *rqstp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - rqstp->rq_auth_stat = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_failed; if (!svcdata) - svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); + svcdata = kmalloc_obj(*svcdata); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; @@ -1638,6 +1639,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp) svcdata->rsci = NULL; gc = &svcdata->clcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 1e091d3fa607..6c742a3400c4 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -45,7 +45,7 @@ static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth, { struct rpc_cred *ret; - ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask()); + ret = kmalloc_obj(*ret, rpc_task_gfp_mask()); if (!ret) { if (!(flags & RPCAUTH_LOOKUP_ASYNC)) return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index caa94cf57123..0ffa4d01a938 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -25,6 +25,22 @@ unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt) } /* + * Helper function to nullify backchannel server pointer in transport. + * We need to synchronize setting the pointer to NULL (done so after + * the backchannel server is shutdown) with the usage of that pointer + * by the backchannel request processing routines + * xprt_complete_bc_request() and rpcrdma_bc_receive_call(). + */ +void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv) +{ + spin_lock(&xprt->bc_pa_lock); + svc_destroy(serv); + xprt->bc_serv = NULL; + spin_unlock(&xprt->bc_pa_lock); +} +EXPORT_SYMBOL_GPL(xprt_svc_destroy_nullify_bc); + +/* * Helper routines that track the number of preallocation elements * on the transport. */ @@ -78,7 +94,7 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt) struct rpc_rqst *req; /* Pre-allocate one backchannel rpc_rqst */ - req = kzalloc(sizeof(*req), gfp_flags); + req = kzalloc_obj(*req, gfp_flags); if (req == NULL) return NULL; @@ -131,7 +147,7 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel); int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) { struct rpc_rqst *req; - struct list_head tmp_list; + LIST_HEAD(tmp_list); int i; dprintk("RPC: setup backchannel transport\n"); @@ -147,7 +163,6 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) * lock is held on the rpc_xprt struct. It also makes cleanup * easier in case of memory allocation errors. */ - INIT_LIST_HEAD(&tmp_list); for (i = 0; i < min_reqs; i++) { /* Pre-allocate one backchannel rpc_rqst */ req = xprt_alloc_bc_req(xprt); @@ -354,7 +369,6 @@ found: void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) { struct rpc_xprt *xprt = req->rq_xprt; - struct svc_serv *bc_serv = xprt->bc_serv; spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); @@ -365,7 +379,21 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); dprintk("RPC: add callback request to list\n"); + xprt_enqueue_bc_request(req); +} + +void xprt_enqueue_bc_request(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct svc_serv *bc_serv; + xprt_get(xprt); - lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list); - svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + spin_lock(&xprt->bc_pa_lock); + bc_serv = xprt->bc_serv; + if (bc_serv) { + lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list); + svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + } + spin_unlock(&xprt->bc_pa_lock); } +EXPORT_SYMBOL_GPL(xprt_enqueue_bc_request); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7ce5e28a6c03..27dd6b58b8ff 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/hex.h> #include <linux/slab.h> #include <linux/signal.h> #include <linux/sched.h> @@ -133,9 +134,11 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, return tmp; } + cache_get(new); hlist_add_head_rcu(&new->cache_list, head); detail->entries++; - cache_get(new); + if (detail->nextcheck > new->expiry_time) + detail->nextcheck = new->expiry_time + 1; spin_unlock(&detail->hash_lock); if (freeme) @@ -230,9 +233,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, spin_lock(&detail->hash_lock); cache_entry_update(detail, tmp, new); - hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); - detail->entries++; cache_get(tmp); + hlist_add_head_rcu(&tmp->cache_list, &detail->hash_table[hash]); + detail->entries++; cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); spin_unlock(&detail->hash_lock); @@ -396,7 +399,11 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); - INIT_LIST_HEAD(&cd->queue); + INIT_LIST_HEAD(&cd->requests); + INIT_LIST_HEAD(&cd->readers); + spin_lock_init(&cd->queue_lock); + init_waitqueue_head(&cd->queue_wait); + cd->next_seqno = 1; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -462,24 +469,21 @@ static int cache_clean(void) } } + spin_lock(¤t_detail->hash_lock); + /* find a non-empty bucket in the table */ - while (current_detail && - current_index < current_detail->hash_size && + while (current_index < current_detail->hash_size && hlist_empty(¤t_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ - - if (current_detail && current_index < current_detail->hash_size) { + if (current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; - spin_lock(¤t_detail->hash_lock); - /* Ok, now to clean this strand */ - head = ¤t_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) @@ -500,8 +504,10 @@ static int cache_clean(void) spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); - } else + } else { + spin_unlock(¤t_detail->hash_lock); spin_unlock(&cache_list_lock); + } return rv; } @@ -792,31 +798,20 @@ void cache_clean_deferred(void *owner) * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. - * - * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New requests are added - * to the end and may wakeup and preceding readers. - * New readers are added to the head. If, on read, an item is found with - * CACHE_UPCALLING clear, we free it from the list. - * */ -static DEFINE_SPINLOCK(queue_lock); - -struct cache_queue { - struct list_head list; - int reader; /* if 0, then request */ -}; struct cache_request { - struct cache_queue q; + struct list_head list; struct cache_head *item; - char * buf; + char *buf; int len; int readers; + u64 seqno; }; struct cache_reader { - struct cache_queue q; + struct list_head list; int offset; /* if non-0, we have a refcnt on next request */ + u64 next_seqno; }; static int cache_request(struct cache_detail *detail, @@ -831,6 +826,17 @@ static int cache_request(struct cache_detail *detail, return PAGE_SIZE - len; } +static struct cache_request * +cache_next_request(struct cache_detail *cd, u64 seqno) +{ + struct cache_request *rq; + + list_for_each_entry(rq, &cd->requests, list) + if (rq->seqno >= seqno) + return rq; + return NULL; +} + static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { @@ -845,25 +851,18 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* need to find next request */ - while (rp->q.list.next != &cd->queue && - list_entry(rp->q.list.next, struct cache_queue, list) - ->reader) { - struct list_head *next = rp->q.list.next; - list_move(&rp->q.list, next); - } - if (rp->q.list.next == &cd->queue) { - spin_unlock(&queue_lock); + rq = cache_next_request(cd, rp->next_seqno); + if (!rq) { + spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } - rq = container_of(rp->q.list.next, struct cache_request, q.list); - WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq->len == 0) { err = cache_request(cd, rq); @@ -874,9 +873,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + rp->next_seqno = rq->seqno + 1; } else { if (rp->offset + count > rq->len) count = rq->len - rp->offset; @@ -886,26 +883,24 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + rp->next_seqno = rq->seqno + 1; } err = 0; } out: if (rp->offset == 0) { /* need to release rq */ - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { - list_del(&rq->q.list); - spin_unlock(&queue_lock); + list_del(&rq->list); + spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (err == -EAGAIN) goto again; @@ -969,16 +964,13 @@ out: return ret; } -static DECLARE_WAIT_QUEUE_HEAD(queue_wait); - static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { __poll_t mask; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; - poll_wait(filp, &queue_wait, wait); + poll_wait(filp, &cd->queue_wait, wait); /* alway allow write */ mask = EPOLLOUT | EPOLLWRNORM; @@ -986,15 +978,11 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, if (!rp) return mask; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - mask |= EPOLLIN | EPOLLRDNORM; - break; - } - spin_unlock(&queue_lock); + if (cache_next_request(cd, rp->next_seqno)) + mask |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&cd->queue_lock); return mask; } @@ -1004,25 +992,20 @@ static int cache_ioctl(struct inode *ino, struct file *filp, { int len = 0; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; + struct cache_request *rq; if (cmd != FIONREAD || !rp) return -EINVAL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* only find the length remaining in current request, * or the length of the next request */ - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, struct cache_request, q); - len = cr->len - rp->offset; - break; - } - spin_unlock(&queue_lock); + rq = cache_next_request(cd, rp->next_seqno); + if (rq) + len = rq->len - rp->offset; + spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); } @@ -1036,17 +1019,17 @@ static int cache_open(struct inode *inode, struct file *filp, return -EACCES; nonseekable_open(inode, filp); if (filp->f_mode & FMODE_READ) { - rp = kmalloc(sizeof(*rp), GFP_KERNEL); + rp = kmalloc_obj(*rp); if (!rp) { module_put(cd->owner); return -ENOMEM; } rp->offset = 0; - rp->q.reader = 1; + rp->next_seqno = 0; - spin_lock(&queue_lock); - list_add(&rp->q.list, &cd->queue); - spin_unlock(&queue_lock); + spin_lock(&cd->queue_lock); + list_add(&rp->list, &cd->readers); + spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) atomic_inc(&cd->writers); @@ -1060,24 +1043,35 @@ static int cache_release(struct inode *inode, struct file *filp, struct cache_reader *rp = filp->private_data; if (rp) { - spin_lock(&queue_lock); + struct cache_request *rq = NULL; + + spin_lock(&cd->queue_lock); if (rp->offset) { - struct cache_queue *cq; - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - container_of(cq, struct cache_request, q) - ->readers--; - break; + struct cache_request *cr; + + cr = cache_next_request(cd, rp->next_seqno); + if (cr) { + cr->readers--; + if (cr->readers == 0 && + !test_bit(CACHE_PENDING, + &cr->item->flags)) { + list_del(&cr->list); + rq = cr; } + } rp->offset = 0; } - list_del(&rp->q.list); - spin_unlock(&queue_lock); + list_del(&rp->list); + spin_unlock(&cd->queue_lock); + + if (rq) { + cache_put(rq->item, cd); + kfree(rq->buf); + kfree(rq); + } filp->private_data = NULL; kfree(rp); - } if (filp->f_mode & FMODE_WRITE) { atomic_dec(&cd->writers); @@ -1091,27 +1085,24 @@ static int cache_release(struct inode *inode, struct file *filp, static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { - struct cache_queue *cq, *tmp; - struct cache_request *cr; + struct cache_request *cr, *tmp; LIST_HEAD(dequeued); - spin_lock(&queue_lock); - list_for_each_entry_safe(cq, tmp, &detail->queue, list) - if (!cq->reader) { - cr = container_of(cq, struct cache_request, q); - if (cr->item != ch) - continue; - if (test_bit(CACHE_PENDING, &ch->flags)) - /* Lost a race and it is pending again */ - break; - if (cr->readers != 0) - continue; - list_move(&cr->q.list, &dequeued); - } - spin_unlock(&queue_lock); + spin_lock(&detail->queue_lock); + list_for_each_entry_safe(cr, tmp, &detail->requests, list) { + if (cr->item != ch) + continue; + if (test_bit(CACHE_PENDING, &ch->flags)) + /* Lost a race and it is pending again */ + break; + if (cr->readers != 0) + continue; + list_move(&cr->list, &dequeued); + } + spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { - cr = list_entry(dequeued.next, struct cache_request, q.list); - list_del(&cr->q.list); + cr = list_entry(dequeued.next, struct cache_request, list); + list_del(&cr->list); cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); @@ -1223,26 +1214,26 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) if (!buf) return -EAGAIN; - crq = kmalloc(sizeof (*crq), GFP_KERNEL); + crq = kmalloc_obj(*crq); if (!crq) { kfree(buf); return -EAGAIN; } - crq->q.reader = 0; crq->buf = buf; crq->len = 0; crq->readers = 0; - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); - list_add_tail(&crq->q.list, &detail->queue); + crq->seqno = detail->next_seqno++; + list_add_tail(&crq->list, &detail->requests); trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; - spin_unlock(&queue_lock); - wake_up(&queue_wait); + spin_unlock(&detail->queue_lock); + wake_up(&detail->queue_wait); if (ret == -EAGAIN) { kfree(buf); kfree(crq); @@ -1357,21 +1348,20 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) hash = n >> 32; entry = n & ((1LL<<32) - 1); + if (hash >= cd->hash_size) + return NULL; + hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; - n &= ~((1LL<<32) - 1); - do { - hash++; - n += 1LL<<32; - } while(hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])); - if (hash >= cd->hash_size) - return NULL; - *pos = n+1; - return hlist_entry_safe(rcu_dereference_raw( + ch = NULL; + while (!ch && ++hash < cd->hash_size) + ch = hlist_entry_safe(rcu_dereference( hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); + + *pos = ((long long)hash << 32) + 1; + return ch; } static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) @@ -1380,29 +1370,29 @@ static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) int hash = (*pos >> 32); struct cache_detail *cd = m->private; - if (p == SEQ_START_TOKEN) + if (p == SEQ_START_TOKEN) { hash = 0; - else if (ch->cache_list.next == NULL) { - hash++; - *pos += 1LL<<32; - } else { - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( - hlist_next_rcu(&ch->cache_list)), - struct cache_head, cache_list); + ch = NULL; } - *pos &= ~((1LL<<32) - 1); - while (hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])) { + while (hash < cd->hash_size) { + if (ch) + ch = hlist_entry_safe( + rcu_dereference( + hlist_next_rcu(&ch->cache_list)), + struct cache_head, cache_list); + else + ch = hlist_entry_safe( + rcu_dereference( + hlist_first_rcu(&cd->hash_table[hash])), + struct cache_head, cache_list); + if (ch) { + ++*pos; + return ch; + } hash++; - *pos += 1LL<<32; + *pos = (long long)hash << 32; } - if (hash >= cd->hash_size) - return NULL; - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( - hlist_first_rcu(&cd->hash_table[hash])), - struct cache_head, cache_list); + return NULL; } void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) @@ -1743,8 +1733,7 @@ struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct ne if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kcalloc(cd->hash_size, sizeof(struct hlist_head), - GFP_KERNEL); + cd->hash_table = kzalloc_objs(struct hlist_head, cd->hash_size); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2fe88ea79a70..bc8ca470718b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -112,47 +112,46 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) } } -static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, +static int rpc_setup_pipedir_sb(struct super_block *sb, struct rpc_clnt *clnt) { static uint32_t clntid; const char *dir_name = clnt->cl_program->pipe_dir_name; char name[15]; - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, dir_name); if (dir == NULL) { pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); - return dir; + return -ENOENT; } for (;;) { snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; - dentry = rpc_create_client_dir(dir, name, clnt); - if (!IS_ERR(dentry)) + err = rpc_create_client_dir(dir, name, clnt); + if (!err) break; - if (dentry == ERR_PTR(-EEXIST)) + if (err == -EEXIST) continue; printk(KERN_INFO "RPC: Couldn't create pipefs entry" - " %s/%s, error %ld\n", - dir_name, name, PTR_ERR(dentry)); + " %s/%s, error %d\n", + dir_name, name, err); break; } dput(dir); - return dentry; + return err; } static int rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt) { - struct dentry *dentry; - clnt->pipefs_sb = pipefs_sb; if (clnt->cl_program->pipe_dir_name != NULL) { - dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + int err = rpc_setup_pipedir_sb(pipefs_sb, clnt); + if (err && err != -ENOENT) + return err; } return 0; } @@ -180,16 +179,9 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { - struct dentry *dentry; - switch (event) { case RPC_PIPEFS_MOUNT: - dentry = rpc_setup_pipedir_sb(sb, clnt); - if (!dentry) - return -ENOENT; - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - break; + return rpc_setup_pipedir_sb(sb, clnt); case RPC_PIPEFS_UMOUNT: __rpc_clnt_remove_pipedir(clnt); break; @@ -270,9 +262,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); @@ -385,7 +374,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, goto out_err; err = -ENOMEM; - clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + clnt = kzalloc_obj(*clnt); if (!clnt) goto out_err; clnt->cl_parent = parent ? : clnt; @@ -512,6 +501,8 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) clnt->cl_chatty = 1; + if (args->flags & RPC_CLNT_CREATE_NETUNREACH_FATAL) + clnt->cl_netunreach_fatal = 1; return clnt; } @@ -662,6 +653,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; + new->cl_netunreach_fatal = clnt->cl_netunreach_fatal; new->cl_principal = clnt->cl_principal; new->cl_max_connect = clnt->cl_max_connect; return new; @@ -1195,6 +1187,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; + if (clnt->cl_netunreach_fatal) + task->tk_flags |= RPC_TASK_NETUNREACH_FATAL; atomic_inc(&clnt->cl_task_count); } @@ -1463,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, - (struct sockaddr *)&rpc_inaddr_loopback, + (struct sockaddr_unsized *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, - (struct sockaddr *)&rpc_in6addr_loopback, + (struct sockaddr_unsized *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: @@ -1480,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_connect(sock, sap, salen, 0); + err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; @@ -2102,14 +2096,17 @@ call_bind_status(struct rpc_task *task) case -EPROTONOSUPPORT: trace_rpcb_bind_version_err(task); goto retry_timeout; + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: - case -ENETDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EPIPE: trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { @@ -2191,19 +2188,22 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: case -ECONNRESET: /* A positive refusal suggests a rebind is needed. */ - if (RPC_IS_SOFTCONN(task)) - break; if (clnt->cl_autobind) { rpc_force_rebind(clnt); + if (RPC_IS_SOFTCONN(task)) + break; goto out_retry; } fallthrough; case -ECONNABORTED: - case -ENETDOWN: - case -ENETUNREACH: case -EHOSTUNREACH: case -EPIPE: case -EPROTO: @@ -2455,10 +2455,13 @@ call_status(struct rpc_task *task) trace_rpc_call_status(task); task->tk_status = 0; switch(status) { - case -EHOSTDOWN: case -ENETDOWN: - case -EHOSTUNREACH: case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + goto out_exit; + fallthrough; + case -EHOSTDOWN: + case -EHOSTUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) goto out_exit; @@ -2760,8 +2763,13 @@ out_verifier: case -EPROTONOSUPPORT: goto out_err; case -EACCES: - /* Re-encode with a fresh cred */ - fallthrough; + /* possible RPCSEC_GSS out-of-sequence event (RFC2203), + * reset recv state and keep waiting, don't retransmit + */ + task->tk_rqstp->rq_reply_bytes_recvd = 0; + task->tk_status = xprt_request_enqueue_receive(task); + task->tk_action = call_transmit_status; + return -EBADMSG; default: goto out_garbage; } @@ -2968,7 +2976,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, return -EINVAL; } - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc_obj(*data); if (!data) return -ENOMEM; data->xps = xprt_switch_get(xps); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc00410ebc..9d349cfbc483 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -168,8 +168,9 @@ rpc_inode_setowner(struct inode *inode, void *private) } static void -rpc_close_pipes(struct inode *inode) +rpc_close_pipes(struct dentry *dentry) { + struct inode *inode = dentry->d_inode; struct rpc_pipe *pipe = RPC_I(inode)->pipe; int need_release; LIST_HEAD(free_list); @@ -484,60 +485,6 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return inode; } -static int __rpc_create_common(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - struct inode *inode; - - d_drop(dentry); - inode = rpc_get_inode(dir->i_sb, mode); - if (!inode) - goto out_err; - inode->i_ino = iunique(dir->i_sb, 100); - if (i_fop) - inode->i_fop = i_fop; - if (private) - rpc_inode_setowner(inode, private); - d_add(dentry, inode); - return 0; -out_err: - printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n", - __FILE__, __func__, dentry); - dput(dentry); - return -ENOMEM; -} - -static int __rpc_create(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private); - if (err) - return err; - fsnotify_create(dir, dentry); - return 0; -} - -static int __rpc_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private); - if (err) - return err; - inc_nlink(dir); - fsnotify_mkdir(dir, dentry); - return 0; -} - static void init_pipe(struct rpc_pipe *pipe) { @@ -564,7 +511,7 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) { struct rpc_pipe *pipe; - pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL); + pipe = kzalloc_obj(struct rpc_pipe); if (!pipe) return ERR_PTR(-ENOMEM); init_pipe(pipe); @@ -574,119 +521,58 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) } EXPORT_SYMBOL_GPL(rpc_mkpipe_data); -static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private, - struct rpc_pipe *pipe) +static int rpc_new_file(struct dentry *parent, + const char *name, + umode_t mode, + const struct file_operations *i_fop, + void *private) { - struct rpc_inode *rpci; - int err; + struct dentry *dentry = simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; - err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); - if (err) - return err; - rpci = RPC_I(d_inode(dentry)); - rpci->private = private; - rpci->pipe = pipe; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + inode = rpc_get_inode(dir->i_sb, S_IFREG | mode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + return -ENOMEM; + } + inode->i_ino = iunique(dir->i_sb, 100); + if (i_fop) + inode->i_fop = i_fop; + rpc_inode_setowner(inode, private); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); + simple_done_creating(dentry); return 0; } -static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) +static struct dentry *rpc_new_dir(struct dentry *parent, + const char *name, + umode_t mode) { - int ret; - - dget(dentry); - ret = simple_rmdir(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_rmdir(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_unlink(struct inode *dir, struct dentry *dentry) -{ - int ret; - - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_unlink(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - - rpc_close_pipes(inode); - return __rpc_unlink(dir, dentry); -} + struct dentry *dentry = simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; -static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, - const char *name) -{ - struct qstr q = QSTR(name); - struct dentry *dentry = d_hash_and_lookup(parent, &q); - if (!dentry) { - dentry = d_alloc(parent, &q); - if (!dentry) - return ERR_PTR(-ENOMEM); - } - if (d_really_is_negative(dentry)) + if (IS_ERR(dentry)) return dentry; - dput(dentry); - return ERR_PTR(-EEXIST); -} - -/* - * FIXME: This probably has races. - */ -static void __rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); - struct dentry *dentry; - struct qstr name; - int i; - - for (i = start; i < eof; i++) { - name.name = files[i].name; - name.len = strlen(files[i].name); - dentry = d_hash_and_lookup(parent, &name); - if (dentry == NULL) - continue; - if (d_really_is_negative(dentry)) - goto next; - switch (d_inode(dentry)->i_mode & S_IFMT) { - default: - BUG(); - case S_IFREG: - __rpc_unlink(dir, dentry); - break; - case S_IFDIR: - __rpc_rmdir(dir, dentry); - } -next: - dput(dentry); + inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + return ERR_PTR(-ENOMEM); } -} -static void rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); + inode->i_ino = iunique(dir->i_sb, 100); + inc_nlink(dir); + d_make_persistent(dentry, inode); + fsnotify_mkdir(dir, dentry); + simple_done_creating(dentry); - inode_lock_nested(dir, I_MUTEX_CHILD); - __rpc_depopulate(parent, files, start, eof); - inode_unlock(dir); + return dentry; // borrowed } static int rpc_populate(struct dentry *parent, @@ -694,92 +580,39 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; - inode_lock(dir); for (i = start; i < eof; i++) { - dentry = __rpc_lookup_create_exclusive(parent, files[i].name); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_bad; switch (files[i].mode & S_IFMT) { default: BUG(); case S_IFREG: - err = __rpc_create(dir, dentry, + err = rpc_new_file(parent, + files[i].name, files[i].mode, files[i].i_fop, private); + if (err) + goto out_bad; break; case S_IFDIR: - err = __rpc_mkdir(dir, dentry, - files[i].mode, - NULL, - private); + dentry = rpc_new_dir(parent, + files[i].name, + files[i].mode); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_bad; + } } - if (err != 0) - goto out_bad; } - inode_unlock(dir); return 0; out_bad: - __rpc_depopulate(parent, files, start, eof); - inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; } -static struct dentry *rpc_mkdir_populate(struct dentry *parent, - const char *name, umode_t mode, void *private, - int (*populate)(struct dentry *, void *), void *args_populate) -{ - struct dentry *dentry; - struct inode *dir = d_inode(parent); - int error; - - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - error = __rpc_mkdir(dir, dentry, mode, NULL, private); - if (error != 0) - goto out_err; - if (populate != NULL) { - error = populate(dentry, args_populate); - if (error) - goto err_rmdir; - } -out: - inode_unlock(dir); - return dentry; -err_rmdir: - __rpc_rmdir(dir, dentry); -out_err: - dentry = ERR_PTR(error); - goto out; -} - -static int rpc_rmdir_depopulate(struct dentry *dentry, - void (*depopulate)(struct dentry *)) -{ - struct dentry *parent; - struct inode *dir; - int error; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - if (depopulate != NULL) - depopulate(dentry); - error = __rpc_rmdir(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; -} - /** * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace * communication @@ -799,11 +632,13 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, * The @private argument passed here will be available to all these methods * from the file pointer, via RPC_I(file_inode(file))->private. */ -struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, +int rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { - struct dentry *dentry; struct inode *dir = d_inode(parent); + struct dentry *dentry; + struct inode *inode; + struct rpc_inode *rpci; umode_t umode = S_IFIFO | 0600; int err; @@ -812,48 +647,52 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~0222; - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops, - private, pipe); - if (err) - goto out_err; -out: - inode_unlock(dir); - return dentry; -out_err: - dentry = ERR_PTR(err); - printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n", - __FILE__, __func__, parent, name, - err); - goto out; + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto failed; + } + + inode = rpc_get_inode(dir->i_sb, umode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + err = -ENOMEM; + goto failed; + } + inode->i_ino = iunique(dir->i_sb, 100); + inode->i_fop = &rpc_pipe_fops; + rpci = RPC_I(inode); + rpci->private = private; + rpci->pipe = pipe; + rpc_inode_setowner(inode, private); + pipe->dentry = dentry; // borrowed + d_make_persistent(dentry, inode); + fsnotify_create(dir, dentry); + simple_done_creating(dentry); + return 0; + +failed: + pr_warn("%s() failed to create pipe %pd/%s (errno = %d)\n", + __func__, parent, name, err); + return err; } EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry); /** * rpc_unlink - remove a pipe - * @dentry: dentry for the pipe, as returned from rpc_mkpipe + * @pipe: the pipe to be removed * * After this call, lookups will no longer find the pipe, and any * attempts to read or write using preexisting opens of the pipe will * return -EPIPE. */ -int -rpc_unlink(struct dentry *dentry) +void +rpc_unlink(struct rpc_pipe *pipe) { - struct dentry *parent; - struct inode *dir; - int error = 0; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - error = __rpc_rmpipe(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; + if (pipe->dentry) { + simple_recursive_removal(pipe->dentry, rpc_close_pipes); + pipe->dentry = NULL; + } } EXPORT_SYMBOL_GPL(rpc_unlink); @@ -1010,31 +849,6 @@ rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh) pdo->pdo_ops->destroy(dir, pdo); } -enum { - RPCAUTH_info, - RPCAUTH_EOF -}; - -static const struct rpc_filelist authfiles[] = { - [RPCAUTH_info] = { - .name = "info", - .i_fop = &rpc_info_operations, - .mode = S_IFREG | 0400, - }, -}; - -static int rpc_clntdir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - authfiles, RPCAUTH_info, RPCAUTH_EOF, - private); -} - -static void rpc_clntdir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); -} - /** * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs * @dentry: the parent of new directory @@ -1046,19 +860,27 @@ static void rpc_clntdir_depopulate(struct dentry *dentry) * information about the client, together with any "pipes" that may * later be created using rpc_mkpipe(). */ -struct dentry *rpc_create_client_dir(struct dentry *dentry, - const char *name, - struct rpc_clnt *rpc_client) +int rpc_create_client_dir(struct dentry *dentry, + const char *name, + struct rpc_clnt *rpc_client) { struct dentry *ret; + int err; - ret = rpc_mkdir_populate(dentry, name, 0555, NULL, - rpc_clntdir_populate, rpc_client); - if (!IS_ERR(ret)) { - rpc_client->cl_pipedir_objects.pdh_dentry = ret; - rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + ret = rpc_new_dir(dentry, name, 0555); + if (IS_ERR(ret)) + return PTR_ERR(ret); + err = rpc_new_file(ret, "info", S_IFREG | 0400, + &rpc_info_operations, rpc_client); + if (err) { + pr_warn("%s failed to populate directory %pd\n", + __func__, ret); + simple_recursive_removal(ret, NULL); + return err; } - return ret; + rpc_client->cl_pipedir_objects.pdh_dentry = ret; + rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + return 0; } /** @@ -1073,7 +895,8 @@ int rpc_remove_client_dir(struct rpc_clnt *rpc_client) return 0; rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects); rpc_client->cl_pipedir_objects.pdh_dentry = NULL; - return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate); + simple_recursive_removal(dentry, NULL); + return 0; } static const struct rpc_filelist cache_pipefs_files[3] = { @@ -1094,28 +917,25 @@ static const struct rpc_filelist cache_pipefs_files[3] = { }, }; -static int rpc_cachedir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - cache_pipefs_files, 0, 3, - private); -} - -static void rpc_cachedir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, cache_pipefs_files, 0, 3); -} - struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name, umode_t umode, struct cache_detail *cd) { - return rpc_mkdir_populate(parent, name, umode, NULL, - rpc_cachedir_populate, cd); + struct dentry *dentry; + + dentry = rpc_new_dir(parent, name, umode); + if (!IS_ERR(dentry)) { + int error = rpc_populate(dentry, cache_pipefs_files, 0, 3, cd); + if (error) { + simple_recursive_removal(dentry, NULL); + return ERR_PTR(error); + } + } + return dentry; } void rpc_remove_cache_dir(struct dentry *dentry) { - rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate); + simple_recursive_removal(dentry, NULL); } /* @@ -1141,7 +961,6 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, - RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1178,10 +997,6 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | 0555, }, - [RPCAUTH_gssd] = { - .name = "gssd", - .mode = S_IFDIR | 0555, - }, }; /* @@ -1190,7 +1005,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - return d_hash_and_lookup(sb->s_root, &QSTR(dir_name)); + return try_lookup_noperm(&QSTR(dir_name), sb->s_root); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1241,13 +1056,6 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); -static const struct rpc_filelist gssd_dummy_clnt_dir[] = { - [0] = { - .name = "clntXX", - .mode = S_IFDIR | 0555, - }, -}; - static ssize_t dummy_downcall(struct file *filp, const char __user *src, size_t len) { @@ -1276,14 +1084,6 @@ rpc_dummy_info_show(struct seq_file *m, void *v) } DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); -static const struct rpc_filelist gssd_dummy_info_file[] = { - [0] = { - .name = "info", - .i_fop = &rpc_dummy_info_fops, - .mode = S_IFREG | 0400, - }, -}; - /** * rpc_gssd_dummy_populate - create a dummy gssd pipe * @root: root of the rpc_pipefs filesystem @@ -1292,69 +1092,32 @@ static const struct rpc_filelist gssd_dummy_info_file[] = { * Create a dummy set of directories and a pipe that gssd can hold open to * indicate that it is up and running. */ -static struct dentry * +static int rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) { - int ret = 0; - struct dentry *gssd_dentry; - struct dentry *clnt_dentry = NULL; - struct dentry *pipe_dentry = NULL; - - /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name)); - if (!gssd_dentry) - return ERR_PTR(-ENOENT); - - ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); - if (ret) { - pipe_dentry = ERR_PTR(ret); - goto out; - } - - clnt_dentry = d_hash_and_lookup(gssd_dentry, - &QSTR(gssd_dummy_clnt_dir[0].name)); - if (!clnt_dentry) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(-ENOENT); - goto out; - } + struct dentry *gssd_dentry, *clnt_dentry; + int err; - ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); - if (ret) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(ret); - goto out; - } + gssd_dentry = rpc_new_dir(root, "gssd", 0555); + if (IS_ERR(gssd_dentry)) + return -ENOENT; - pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); - if (IS_ERR(pipe_dentry)) { - __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - } -out: - dput(clnt_dentry); - dput(gssd_dentry); - return pipe_dentry; -} + clnt_dentry = rpc_new_dir(gssd_dentry, "clntXX", 0555); + if (IS_ERR(clnt_dentry)) + return -ENOENT; -static void -rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) -{ - struct dentry *clnt_dir = pipe_dentry->d_parent; - struct dentry *gssd_dir = clnt_dir->d_parent; - - dget(pipe_dentry); - __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); - __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); - dput(pipe_dentry); + err = rpc_new_file(clnt_dentry, "info", 0400, + &rpc_dummy_info_fops, NULL); + if (!err) + err = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + return err; } static int rpc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct dentry *root, *gssd_dentry; + struct dentry *root; struct net *net = sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1363,7 +1126,7 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &simple_dentry_operations; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0555); @@ -1373,11 +1136,9 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); - if (IS_ERR(gssd_dentry)) { - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); - return PTR_ERR(gssd_dentry); - } + err = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (err) + return err; dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", net->ns.inum, NET_NAME(net)); @@ -1386,18 +1147,6 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, sb); - if (err) - goto err_depopulate; - mutex_unlock(&sn->pipefs_sb_lock); - return 0; - -err_depopulate: - rpc_gssd_dummy_depopulate(gssd_dentry); - blocking_notifier_call_chain(&rpc_pipefs_notifier_list, - RPC_PIPEFS_UMOUNT, - sb); - sn->pipefs_sb = NULL; - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); mutex_unlock(&sn->pipefs_sb_lock); return err; } @@ -1454,7 +1203,7 @@ static void rpc_kill_sb(struct super_block *sb) sb); mutex_unlock(&sn->pipefs_sb_lock); out: - kill_litter_super(sb); + kill_anon_super(sb); put_net(net); } diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 102c3818bc54..6aa372188c86 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -737,7 +737,7 @@ void rpcb_getport_async(struct rpc_task *task) goto bailout_nofree; } - map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask()); + map = kzalloc_obj(struct rpcbind_args, rpc_task_gfp_mask()); if (!map) { status = -ENOMEM; goto bailout_release_client; @@ -820,9 +820,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) } trace_rpcb_setport(child, map->r_status, map->r_port); - xprt->ops->set_port(xprt, map->r_port); - if (map->r_port) + if (map->r_port) { + xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); + } } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9b45fbdc90ca..016f16ca5779 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1074,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task) rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } -EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free RPC buffer resources allocated via rpc_malloc @@ -1095,7 +1094,6 @@ void rpc_free(struct rpc_task *task) else kfree(buf); } -EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1b2b84feeec6..d8d8842c7de5 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -27,135 +27,91 @@ struct xdr_skb_reader { struct sk_buff *skb; unsigned int offset; + bool need_checksum; size_t count; __wsum csum; }; -typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, - size_t len); - /** * xdr_skb_read_bits - copy some data bits from skb to internal buffer * @desc: sk_buff copy helper * @to: copy destination * @len: number of bytes to copy * - * Possibly called several times to iterate over an sk_buff and copy - * data out of it. + * Possibly called several times to iterate over an sk_buff and copy data out of + * it. */ static size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { - if (len > desc->count) - len = desc->count; - if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} + len = min(len, desc->count); + + if (desc->need_checksum) { + __wsum csum; + + csum = skb_copy_and_csum_bits(desc->skb, desc->offset, to, len); + desc->csum = csum_block_add(desc->csum, csum, desc->offset); + } else { + if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) + return 0; + } -/** - * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer - * @desc: sk_buff copy helper - * @to: copy destination - * @len: number of bytes to copy - * - * Same as skb_read_bits, but calculate a checksum at the same time. - */ -static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len) -{ - unsigned int pos; - __wsum csum2; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len); - desc->csum = csum_block_add(desc->csum, csum2, pos); desc->count -= len; desc->offset += len; return len; } -/** - * xdr_partial_copy_from_skb - copy data out of an skb - * @xdr: target XDR buffer - * @base: starting offset - * @desc: sk_buff copy helper - * @copy_actor: virtual method for copying data - * - */ static ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) { - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - size_t ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (unlikely(pglen == 0)) - goto copy_tail; - if (unlikely(base >= pglen)) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_SHIFT; - base &= ~PAGE_MASK; - } - do { + struct page **ppage = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + unsigned int poff = xdr->page_base & ~PAGE_MASK; + unsigned int pglen = xdr->page_len; + ssize_t copied = 0; + size_t ret; + + if (xdr->head[0].iov_len == 0) + return 0; + + ret = xdr_skb_read_bits(desc, xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret != xdr->head[0].iov_len || !desc->count) + return ret; + copied += ret; + + while (pglen) { + unsigned int len = min(PAGE_SIZE - poff, pglen); char *kaddr; /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppage = alloc_page(GFP_NOWAIT); if (unlikely(*ppage == NULL)) { if (copied == 0) - copied = -ENOMEM; - goto out; + return -ENOMEM; + return copied; } } - len = PAGE_SIZE; kaddr = kmap_atomic(*ppage); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } + ret = xdr_skb_read_bits(desc, kaddr + poff, len); flush_dcache_page(*ppage); kunmap_atomic(kaddr); + copied += ret; if (ret != len || !desc->count) - goto out; + return copied; ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: + pglen -= len; + poff = 0; + } + + if (xdr->tail[0].iov_len) { + copied += xdr_skb_read_bits(desc, xdr->tail[0].iov_base, + xdr->tail[0].iov_len); + } + return copied; } @@ -169,17 +125,22 @@ out: */ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { - struct xdr_skb_reader desc; - - desc.skb = skb; - desc.offset = 0; - desc.count = skb->len - desc.offset; + struct xdr_skb_reader desc = { + .skb = skb, + .count = skb->len - desc.offset, + }; - if (skb_csum_unnecessary(skb)) - goto no_checksum; + if (skb_csum_unnecessary(skb)) { + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) + return -1; + if (desc.count) + return -1; + return 0; + } + desc.need_checksum = true; desc.csum = csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0) + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) return -1; if (desc.offset != skb->len) { __wsum csum2; @@ -194,14 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; } -EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek) diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 383860cb1d5b..7093e18ac26c 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -126,7 +126,7 @@ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) struct rpc_iostats *stats; int i; - stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); + stats = kzalloc_objs(*stats, clnt->cl_maxproc); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7f9c295d13c..576fa42e7abf 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { @@ -489,7 +488,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, unsigned int xdrsize; unsigned int i; - if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) + if (!(serv = kzalloc_obj(*serv))) return NULL; serv->sv_name = prog->pg_name; serv->sv_programs = prog; @@ -524,8 +523,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, serv->sv_nrpools = npools; serv->sv_pools = - kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), - GFP_KERNEL); + kzalloc_objs(struct svc_pool, serv->sv_nrpools); if (!serv->sv_pools) { kfree(serv); return NULL; @@ -636,24 +634,30 @@ svc_destroy(struct svc_serv **servp) EXPORT_SYMBOL_GPL(svc_destroy); static bool -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) +svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { - unsigned long pages, ret; - - /* bc_xprt uses fore channel allocated buffers */ - if (svc_is_backchannel(rqstp)) - return true; - - pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. - * We assume one is at most one page - */ - WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); - if (pages > RPCSVC_MAXPAGES) - pages = RPCSVC_MAXPAGES; - - ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); - return ret == pages; + rqstp->rq_maxpages = svc_serv_maxpages(serv); + + /* +1 for a NULL sentinel readable by nfsd_splice_actor() */ + rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_pages) + return false; + + /* +1 for a NULL sentinel at rq_page_end (see svc_rqst_replace_page) */ + rqstp->rq_respages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_respages) { + kfree(rqstp->rq_pages); + rqstp->rq_pages = NULL; + return false; + } + + rqstp->rq_pages_nfree = rqstp->rq_maxpages; + rqstp->rq_next_page = rqstp->rq_respages + rqstp->rq_maxpages; + return true; } /* @@ -662,20 +666,31 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) static void svc_release_buffer(struct svc_rqst *rqstp) { - unsigned int i; + unsigned long i; - for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++) - if (rqstp->rq_pages[i]) - put_page(rqstp->rq_pages[i]); + if (rqstp->rq_pages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); + } + + if (rqstp->rq_respages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_respages[i]) + put_page(rqstp->rq_respages[i]); + kfree(rqstp->rq_respages); + } } static void svc_rqst_free(struct svc_rqst *rqstp) { folio_batch_release(&rqstp->rq_fbatch); + kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -696,8 +711,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); @@ -708,7 +723,13 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp->rq_resp) goto out_enomem; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) + if (!svc_init_buffer(rqstp, serv, node)) + goto out_enomem; + + rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, + sizeof(struct bio_vec), + GFP_KERNEL, node); + if (!rqstp->rq_bvec) goto out_enomem; rqstp->rq_err = -EAGAIN; /* No error yet */ @@ -749,119 +770,101 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool) WRITE_ONCE(rqstp->rq_qtime, ktime_get()); if (!task_is_running(rqstp->rq_task)) { wake_up_process(rqstp->rq_task); - trace_svc_wake_up(rqstp->rq_task->pid); + trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid); percpu_counter_inc(&pool->sp_threads_woken); + } else { + trace_svc_pool_thread_running(pool, rqstp->rq_task->pid); } rcu_read_unlock(); return; } rcu_read_unlock(); - + trace_svc_pool_thread_noidle(pool, 0); } EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); -static struct svc_pool * -svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) -{ - return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; -} - -static struct svc_pool * -svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool, - unsigned int *state) +/** + * svc_new_thread - spawn a new thread in the given pool + * @serv: the serv to which the pool belongs + * @pool: pool in which thread should be spawned + * + * Create a new thread inside @pool, which is a part of @serv. + * Caller must hold the service mutex. + * + * Returns 0 on success, or -errno on failure. + */ +int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool) { - struct svc_pool *pool; - unsigned int i; + struct svc_rqst *rqstp; + struct task_struct *task; + int node; + int err = 0; - pool = target_pool; + node = svc_pool_map_get_node(pool->sp_id); - if (!pool) { - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; - if (pool->sp_nrthreads) - break; - } + rqstp = svc_prepare_thread(serv, pool, node); + if (!rqstp) + return -ENOMEM; + task = kthread_create_on_node(serv->sv_threadfn, rqstp, + node, "%s", serv->sv_name); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto out; } - if (pool && pool->sp_nrthreads) { - set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); - set_bit(SP_NEED_VICTIM, &pool->sp_flags); - return pool; - } - return NULL; + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); + + /* Wait for the thread to signal initialization status */ + wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); + err = rqstp->rq_err; +out: + if (err) + svc_exit_thread(rqstp); + return err; } +EXPORT_SYMBOL_GPL(svc_new_thread); static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct svc_rqst *rqstp; - struct task_struct *task; - struct svc_pool *chosen_pool; - unsigned int state = serv->sv_nrthreads-1; - int node; - int err; - - do { - nrservs--; - chosen_pool = svc_pool_next(serv, pool, &state); - node = svc_pool_map_get_node(chosen_pool->sp_id); - - rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (!rqstp) - return -ENOMEM; - task = kthread_create_on_node(serv->sv_threadfn, rqstp, - node, "%s", serv->sv_name); - if (IS_ERR(task)) { - svc_exit_thread(rqstp); - return PTR_ERR(task); - } - - rqstp->rq_task = task; - if (serv->sv_nrpools > 1) - svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + int err = 0; - svc_sock_update_bufs(serv); - wake_up_process(task); + while (!err && nrservs--) + err = svc_new_thread(serv, pool); - wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); - err = rqstp->rq_err; - if (err) { - svc_exit_thread(rqstp); - return err; - } - } while (nrservs > 0); - - return 0; + return err; } static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - unsigned int state = serv->sv_nrthreads-1; - struct svc_pool *victim; - do { - victim = svc_pool_victim(serv, pool, &state); - if (!victim) - break; - svc_pool_wake_idle_thread(victim); - wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS, - TASK_IDLE); + set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); + set_bit(SP_NEED_VICTIM, &pool->sp_flags); + svc_pool_wake_idle_thread(pool); + wait_on_bit(&pool->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE); nrservs++; } while (nrservs < 0); return 0; } /** - * svc_set_num_threads - adjust number of threads per RPC service + * svc_set_pool_threads - adjust number of threads per pool * @serv: RPC service to adjust - * @pool: Specific pool from which to choose threads, or NULL - * @nrservs: New number of threads for @serv (0 or less means kill all threads) + * @pool: Specific pool from which to choose threads + * @min_threads: min number of threads to run in @pool + * @max_threads: max number of threads in @pool (0 means kill all threads) * - * Create or destroy threads to make the number of threads for @serv the - * given number. If @pool is non-NULL, change only threads in that pool; - * otherwise, round-robin between all pools for @serv. @serv's - * sv_nrthreads is adjusted for each thread created or destroyed. + * Create or destroy threads in @pool to bring it into an acceptable range + * between @min_threads and @max_threads. + * + * If @min_threads is 0 or larger than @max_threads, then it is ignored and + * the pool will be set to run a static @max_threads number of threads. * * Caller must ensure mutual exclusion between this and server startup or * shutdown. @@ -870,27 +873,93 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) * starting a thread. */ int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +svc_set_pool_threads(struct svc_serv *serv, struct svc_pool *pool, + unsigned int min_threads, unsigned int max_threads) { + int delta; + if (!pool) - nrservs -= serv->sv_nrthreads; - else - nrservs -= pool->sp_nrthreads; + return -EINVAL; - if (nrservs > 0) - return svc_start_kthreads(serv, pool, nrservs); - if (nrservs < 0) - return svc_stop_kthreads(serv, pool, nrservs); + /* clamp min threads to the max */ + if (min_threads > max_threads) + min_threads = max_threads; + + pool->sp_nrthrmin = min_threads; + pool->sp_nrthrmax = max_threads; + + /* + * When min_threads is set, then only change the number of + * threads to bring it within an acceptable range. + */ + if (min_threads) { + if (pool->sp_nrthreads > max_threads) + delta = max_threads; + else if (pool->sp_nrthreads < min_threads) + delta = min_threads; + else + return 0; + } else { + delta = max_threads; + } + + delta -= pool->sp_nrthreads; + if (delta > 0) + return svc_start_kthreads(serv, pool, delta); + if (delta < 0) + return svc_stop_kthreads(serv, pool, delta); return 0; } +EXPORT_SYMBOL_GPL(svc_set_pool_threads); + +/** + * svc_set_num_threads - adjust number of threads in serv + * @serv: RPC service to adjust + * @min_threads: min number of threads to run per pool + * @nrservs: New number of threads for @serv (0 means kill all threads) + * + * Create or destroy threads in @serv to bring it to @nrservs. If there + * are multiple pools then the new threads or victims will be distributed + * evenly among them. + * + * Caller must ensure mutual exclusion between this and server startup or + * shutdown. + * + * Returns zero on success or a negative errno if an error occurred while + * starting a thread. On failure, some pools may have already been + * adjusted; the caller is responsible for recovery. + */ +int +svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads, + unsigned int nrservs) +{ + unsigned int base = nrservs / serv->sv_nrpools; + unsigned int remain = nrservs % serv->sv_nrpools; + int i, err = 0; + + for (i = 0; i < serv->sv_nrpools; ++i) { + struct svc_pool *pool = &serv->sv_pools[i]; + int threads = base; + + if (remain) { + ++threads; + --remain; + } + + err = svc_set_pool_threads(serv, pool, min_threads, threads); + if (err) + break; + } + return err; +} EXPORT_SYMBOL_GPL(svc_set_num_threads); /** - * svc_rqst_replace_page - Replace one page in rq_pages[] + * svc_rqst_replace_page - Replace one page in rq_respages[] * @rqstp: svc_rqst with pages to replace * @page: replacement page * - * When replacing a page in rq_pages, batch the release of the + * When replacing a page in rq_respages, batch the release of the * replaced pages to avoid hammering the page allocator. * * Return values: @@ -899,19 +968,16 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); */ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { - struct page **begin = rqstp->rq_pages; - struct page **end = &rqstp->rq_pages[RPCSVC_MAXPAGES]; + struct page **begin = rqstp->rq_respages; + struct page **end = rqstp->rq_page_end; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); return false; } - if (*rqstp->rq_next_page) { - if (!folio_batch_add(&rqstp->rq_fbatch, - page_folio(*rqstp->rq_next_page))) - __folio_batch_release(&rqstp->rq_fbatch); - } + if (*rqstp->rq_next_page) + svc_rqst_page_release(rqstp, *rqstp->rq_next_page); get_page(page); *(rqstp->rq_next_page++) = page; @@ -923,18 +989,24 @@ EXPORT_SYMBOL_GPL(svc_rqst_replace_page); * svc_rqst_release_pages - Release Reply buffer pages * @rqstp: RPC transaction context * - * Release response pages that might still be in flight after - * svc_send, and any spliced filesystem-owned pages. + * Release response pages in the range [rq_respages, rq_next_page). + * NULL entries in this range are skipped, allowing transports to + * transfer pages to a send context before this function runs. */ void svc_rqst_release_pages(struct svc_rqst *rqstp) { - int i, count = rqstp->rq_next_page - rqstp->rq_respages; - - if (count) { - release_pages(rqstp->rq_respages, count); - for (i = 0; i < count; i++) - rqstp->rq_respages[i] = NULL; + struct page **pp; + + for (pp = rqstp->rq_respages; pp < rqstp->rq_next_page; pp++) { + if (*pp) { + if (!folio_batch_add(&rqstp->rq_fbatch, + page_folio(*pp))) + __folio_batch_release(&rqstp->rq_fbatch); + *pp = NULL; + } } + if (rqstp->rq_fbatch.nr) + __folio_batch_release(&rqstp->rq_fbatch); } /** @@ -1330,6 +1402,9 @@ svc_process_common(struct svc_rqst *rqstp) int pr, rc; __be32 *p; + /* Reset the accept_stat for the RPC */ + rqstp->rq_accept_statp = NULL; + /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); @@ -1369,9 +1444,8 @@ svc_process_common(struct svc_rqst *rqstp) case SVC_OK: break; case SVC_GARBAGE: - goto err_garbage_args; - case SVC_SYSERR: - goto err_system_err; + rqstp->rq_auth_stat = rpc_autherr_badcred; + goto err_bad_auth; case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: @@ -1382,7 +1456,8 @@ svc_process_common(struct svc_rqst *rqstp) goto sendit; default: pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); - goto err_system_err; + rqstp->rq_auth_stat = rpc_autherr_failed; + goto err_bad_auth; } if (progp == NULL) @@ -1419,8 +1494,6 @@ svc_process_common(struct svc_rqst *rqstp) /* Call the function that processes the request. */ rc = process.dispatch(rqstp); - if (procp->pc_release) - procp->pc_release(rqstp); xdr_finish_decode(xdr); if (!rc) @@ -1509,20 +1582,6 @@ err_bad_proc: serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; - -err_garbage_args: - svc_printk(rqstp, "failed to decode RPC header\n"); - - if (serv->sv_stats) - serv->sv_stats->rpcbadfmt++; - *rqstp->rq_accept_statp = rpc_garbage_args; - goto sendit; - -err_system_err: - if (serv->sv_stats) - serv->sv_stats->rpcbadfmt++; - *rqstp->rq_accept_statp = rpc_system_err; - goto sendit; } /* @@ -1533,6 +1592,14 @@ static void svc_drop(struct svc_rqst *rqstp) trace_svc_drop(rqstp); } +static void svc_release_rqst(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + if (procp && procp->pc_release) + procp->pc_release(rqstp); +} + /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context @@ -1572,9 +1639,12 @@ void svc_process(struct svc_rqst *rqstp) if (unlikely(*p != rpc_call)) goto out_baddir; - if (!svc_process_common(rqstp)) + if (!svc_process_common(rqstp)) { + svc_release_rqst(rqstp); goto out_drop; + } svc_send(rqstp); + svc_release_rqst(rqstp); return; out_baddir: @@ -1642,6 +1712,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); + svc_release_rqst(rqstp); return; } /* Finally, send the reply synchronously */ @@ -1655,6 +1726,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); + svc_release_rqst(rqstp); if (IS_ERR(task)) return; @@ -1714,46 +1786,6 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** - * svc_fill_write_vector - Construct data argument for VFS write call - * @rqstp: svc_rqst to operate on - * @payload: xdr_buf containing only the write data payload - * - * Fills in rqstp::rq_vec, and returns the number of elements. - */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload) -{ - struct page **pages = payload->pages; - struct kvec *first = payload->head; - struct kvec *vec = rqstp->rq_vec; - size_t total = payload->len; - unsigned int i; - - /* Some types of transport can present the write payload - * entirely in rq_arg.pages. In this case, @first is empty. - */ - i = 0; - if (first->iov_len) { - vec[i].iov_base = first->iov_base; - vec[i].iov_len = min_t(size_t, total, first->iov_len); - total -= vec[i].iov_len; - ++i; - } - - while (total) { - vec[i].iov_base = page_address(*pages); - vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); - total -= vec[i].iov_len; - ++i; - ++pages; - } - - WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec)); - return i; -} -EXPORT_SYMBOL_GPL(svc_fill_write_vector); - -/** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ae25405d8bd2..b16e710926c1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -488,6 +488,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server); percpu_counter_inc(&pool->sp_sockets_queued); + xprt->xpt_qtime = ktime_get(); lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts); svc_pool_wake_idle_thread(pool); @@ -649,22 +650,13 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static bool svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, + unsigned long npages) { - struct svc_serv *serv = rqstp->rq_server; - struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled, ret; - - pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; - if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", - pages, RPCSVC_MAXPAGES); - /* use as many pages as possible */ - pages = RPCSVC_MAXPAGES; - } + unsigned long filled, ret; - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); + for (filled = 0; filled < npages; filled = ret) { + ret = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -674,11 +666,40 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(npages, ret); memalloc_retry_wait(GFP_KERNEL); } - rqstp->rq_page_end = &rqstp->rq_pages[pages]; - rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ + return true; +} + +static bool svc_alloc_arg(struct svc_rqst *rqstp) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + unsigned long pages, nfree; + + pages = rqstp->rq_maxpages; + + nfree = rqstp->rq_pages_nfree; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_pages, nfree)) + return false; + rqstp->rq_pages_nfree = 0; + } + + if (WARN_ON_ONCE(rqstp->rq_next_page < rqstp->rq_respages)) + return false; + nfree = rqstp->rq_next_page - rqstp->rq_respages; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_respages, nfree)) + return false; + } + + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_page_end = &rqstp->rq_respages[pages]; + /* svc_rqst_replace_page() dereferences *rq_next_page even + * at rq_page_end; NULL prevents releasing a garbage page. + */ + rqstp->rq_page_end[0] = NULL; /* Make arg->head point to first page and arg->pages point to rest */ arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); @@ -721,15 +742,21 @@ svc_thread_should_sleep(struct svc_rqst *rqstp) return true; } -static void svc_thread_wait_for_work(struct svc_rqst *rqstp) +static bool svc_schedule_timeout(long timeo) +{ + return schedule_timeout(timeo ? timeo : MAX_SCHEDULE_TIMEOUT) == 0; +} + +static bool svc_thread_wait_for_work(struct svc_rqst *rqstp, long timeo) { struct svc_pool *pool = rqstp->rq_pool; + bool did_timeout = false; if (svc_thread_should_sleep(rqstp)) { set_current_state(TASK_IDLE | TASK_FREEZABLE); llist_add(&rqstp->rq_idle, &pool->sp_idle_threads); if (likely(svc_thread_should_sleep(rqstp))) - schedule(); + did_timeout = svc_schedule_timeout(timeo); while (!llist_del_first_this(&pool->sp_idle_threads, &rqstp->rq_idle)) { @@ -741,7 +768,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp) * for this new work. This thread can safely sleep * until woken again. */ - schedule(); + did_timeout = svc_schedule_timeout(timeo); set_current_state(TASK_IDLE | TASK_FREEZABLE); } __set_current_state(TASK_RUNNING); @@ -749,6 +776,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp) cond_resched(); } try_to_freeze(); + return did_timeout; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) @@ -842,25 +870,38 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp) /** * svc_recv - Receive and process the next request on any transport * @rqstp: an idle RPC service thread + * @timeo: timeout (in jiffies) (0 means infinite timeout) * * This code is carefully organised not to touch any cachelines in * the shared svc_serv structure, only cachelines in the local * svc_pool. + * + * If the timeout is 0, then the sleep will never time out. + * + * Returns -ETIMEDOUT if idle for an extended period + * -EBUSY if there is more work to do than available threads + * 0 otherwise. */ -void svc_recv(struct svc_rqst *rqstp) +int svc_recv(struct svc_rqst *rqstp, long timeo) { struct svc_pool *pool = rqstp->rq_pool; + bool did_timeout; + int ret = 0; if (!svc_alloc_arg(rqstp)) - return; + return ret; + + did_timeout = svc_thread_wait_for_work(rqstp, timeo); - svc_thread_wait_for_work(rqstp); + if (did_timeout && svc_thread_should_sleep(rqstp) && + pool->sp_nrthrmin && pool->sp_nrthreads > pool->sp_nrthrmin) + ret = -ETIMEDOUT; clear_bit(SP_TASK_PENDING, &pool->sp_flags); if (svc_thread_should_stop(rqstp)) { svc_thread_wake_next(rqstp); - return; + return ret; } rqstp->rq_xprt = svc_xprt_dequeue(pool); @@ -872,10 +913,22 @@ void svc_recv(struct svc_rqst *rqstp) * cache information to be provided. When there are no * idle threads, we reduce the wait time. */ - if (pool->sp_idle_threads.first) + if (pool->sp_idle_threads.first) { rqstp->rq_chandle.thread_wait = 5 * HZ; - else + } else { rqstp->rq_chandle.thread_wait = 1 * HZ; + /* + * No idle threads: signal -EBUSY so the caller + * can consider spawning another thread. Use + * SP_TASK_STARTING to limit this signal to one + * thread at a time; the caller clears this flag + * after starting a new thread. + */ + if (!did_timeout && timeo && + !test_and_set_bit(SP_TASK_STARTING, + &pool->sp_flags)) + ret = -EBUSY; + } trace_svc_xprt_dequeue(rqstp); svc_handle_xprt(rqstp, xprt); @@ -894,6 +947,7 @@ void svc_recv(struct svc_rqst *rqstp) } } #endif + return ret; } EXPORT_SYMBOL_GPL(svc_recv); @@ -929,7 +983,7 @@ void svc_send(struct svc_rqst *rqstp) */ static void svc_age_temp_xprts(struct timer_list *t) { - struct svc_serv *serv = from_timer(serv, t, sv_temptimer); + struct svc_serv *serv = timer_container_of(serv, t, sv_temptimer); struct svc_xprt *xprt; struct list_head *le, *next; @@ -1021,6 +1075,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. + */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; @@ -1109,6 +1176,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1121,7 +1189,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1131,6 +1200,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); @@ -1233,7 +1305,6 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; dr->xprt_ctxt = NULL; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 8ca98b146ec8..3be69c145d2a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -72,7 +72,7 @@ struct auth_domain *unix_domain_find(char *name) return rv; } - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (new == NULL) return NULL; kref_init(&new->h.ref); @@ -143,7 +143,7 @@ static void update(struct cache_head *cnew, struct cache_head *citem) } static struct cache_head *ip_map_alloc(void) { - struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL); + struct ip_map *i = kmalloc_obj(*i); if (i) return &i->h; else @@ -458,7 +458,7 @@ static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem) } static struct cache_head *unix_gid_alloc(void) { - struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL); + struct unix_gid *g = kmalloc_obj(*g); if (g) return &g->h; else diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 72e5a01df3d3..7be3de1a1aed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. */ @@ -257,20 +268,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) +svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; - struct socket *sock = svsk->sk_sock; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; + int ret; + + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN); + } + return ret; +} + +static int +svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg) +{ int ret; + struct socket *sock = svsk->sk_sock; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); - if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags); + } return ret; } @@ -313,15 +351,13 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); - rqstp->rq_respages = &rqstp->rq_pages[i]; - rqstp->rq_next_page = rqstp->rq_respages + 1; iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); if (seek) { iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; } - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len > 0) svc_flush_bvec(bvec, len, seek); @@ -639,13 +675,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; - rqstp->rq_respages = rqstp->rq_pages+1; } else { rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_respages = rqstp->rq_pages + 1 + - DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE); } - rqstp->rq_next_page = rqstp->rq_respages+1; if (serv->sv_stats) serv->sv_stats->netudpcnt++; @@ -713,15 +745,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, - ARRAY_SIZE(rqstp->rq_bvec), xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -810,6 +841,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -956,7 +988,7 @@ static size_t svc_tcp_restore_pages(struct svc_sock *svsk, npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) - put_page(rqstp->rq_pages[i]); + svc_rqst_page_release(rqstp, rqstp->rq_pages[i]); BUG_ON(svsk->sk_pages[i] == NULL); rqstp->rq_pages[i] = svsk->sk_pages[i]; svsk->sk_pages[i] = NULL; @@ -977,6 +1009,7 @@ static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) svsk->sk_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = npages; } static void svc_tcp_clear_pages(struct svc_sock *svsk) @@ -1019,7 +1052,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len < 0) return len; svsk->sk_tcplen += len; @@ -1035,9 +1068,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, return svc_sock_reclen(svsk); err_too_large: - net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", - __func__, svsk->sk_xprt.xpt_server->sv_name, - svc_sock_reclen(svsk)); + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); svc_xprt_deferred_close(&svsk->sk_xprt); err_short: return -EAGAIN; @@ -1198,7 +1232,7 @@ err_noclose: * that the pages backing @xdr are unchanging. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, - rpc_fraghdr marker, unsigned int *sentp) + rpc_fraghdr marker) { struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, @@ -1207,29 +1241,24 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, void *buf; int ret; - *sentp = 0; - /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. */ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, - ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res); + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, + &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); - if (ret < 0) - return ret; - *sentp += ret; - return 0; + return ret; } /** @@ -1248,8 +1277,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - unsigned int sent; - int err; + int sent; svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; @@ -1257,9 +1285,9 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); - trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); - if (err < 0 || sent != (xdr->len + sizeof(marker))) + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; mutex_unlock(&xprt->xpt_mutex); return sent; @@ -1268,10 +1296,10 @@ out_notconn: mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: - pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, - (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len); + (sent < 0) ? "got error" : "sent", + sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; @@ -1330,6 +1358,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { @@ -1340,7 +1369,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; - memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + memset(&svsk->sk_pages[0], 0, + svsk->sk_maxpages * sizeof(struct page *)); tcp_sock_set_nodelay(sk); @@ -1369,6 +1399,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1379,11 +1423,28 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; + unsigned long pages; - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + + pages = svc_serv_maxpages(serv); + svsk = kzalloc_flex(*svsk, sk_pages, pages); if (!svsk) return ERR_PTR(-ENOMEM); + if (sendpages) { + svsk->sk_bvec = kzalloc_objs(*svsk->sk_bvec, sendpages); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + + svsk->sk_maxpages = pages; + inet = sock->sk; if (pmap_register) { @@ -1393,6 +1454,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1531,7 +1593,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; @@ -1542,7 +1604,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (protocol == IPPROTO_TCP) { sk_net_refcnt_upgrade(sock->sk); - if ((error = kernel_listen(sock, 64)) < 0) + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } @@ -1610,5 +1672,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 5c8ecdaaa985..a90480f80154 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -6,6 +6,7 @@ #include <linux/kobject.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> +#include <net/net_namespace.h> #include "sysfs.h" @@ -48,7 +49,7 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, { struct kobject *kobj; - kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + kobj = kzalloc_obj(*kobj); if (kobj) { kobj->kset = kset; if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, @@ -59,6 +60,16 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, return NULL; } +static inline struct rpc_clnt * +rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj) +{ + struct rpc_sysfs_client *c = container_of(kobj, + struct rpc_sysfs_client, kobject); + struct rpc_clnt *ret = c->clnt; + + return refcount_inc_not_zero(&ret->cl_count) ? ret : NULL; +} + static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { @@ -86,6 +97,51 @@ rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) return xprt_switch_get(x->xprt_switch); } +static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u", clnt->cl_vers); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%s", clnt->cl_program->name); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u\n", clnt->cl_max_connect); + refcount_dec(&clnt->cl_count); + return ret; +} + static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -129,6 +185,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, return ret; } +static const char *xprtsec_strings[] = { + [RPC_XPRTSEC_NONE] = "none", + [RPC_XPRTSEC_TLS_ANON] = "tls-anon", + [RPC_XPRTSEC_TLS_X509] = "tls-x509", +}; + +static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) { + ret = sprintf(buf, "<closed>\n"); + goto out; + } + + ret = sprintf(buf, "%s\n", xprtsec_strings[xprt->xprtsec.policy]); + xprt_put(xprt); +out: + return ret; + +} + static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -206,6 +287,14 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# delete this xprt\n"); +} + + static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -225,6 +314,55 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# add one xprt to this xprt_switch\n"); +} + +static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + struct xprt_create xprt_create_args; + struct rpc_xprt *xprt, *new; + + if (!xprt_switch) + return 0; + + xprt = rpc_xprt_switch_get_main_xprt(xprt_switch); + if (!xprt) + goto out; + + xprt_create_args.ident = xprt->xprt_class->ident; + xprt_create_args.net = xprt->xprt_net; + xprt_create_args.dstaddr = (struct sockaddr *)&xprt->addr; + xprt_create_args.addrlen = xprt->addrlen; + xprt_create_args.servername = xprt->servername; + xprt_create_args.bc_xprt = xprt->bc_xprt; + xprt_create_args.xprtsec = xprt->xprtsec; + xprt_create_args.connect_timeout = xprt->connect_timeout; + xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; + + new = xprt_create_transport(&xprt_create_args); + if (IS_ERR_OR_NULL(new)) { + count = PTR_ERR(new); + goto out_put_xprt; + } + + rpc_xprt_switch_add_xprt(xprt_switch, new); + xprt_put(new); + +out_put_xprt: + xprt_put(xprt); +out: + xprt_switch_put(xprt_switch); + return count; +} + static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -252,7 +390,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); - /* buf_len is the len until the first occurence of either + /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); @@ -260,7 +398,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); if (!dst_addr) goto out_err; - saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); + saved_addr = kzalloc_obj(*saved_addr); if (!saved_addr) goto out_err_free; saved_addr->addr = @@ -335,6 +473,40 @@ out_put: return count; } +static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt || !xps) { + count = 0; + goto out; + } + + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + + xprt_set_offline_locked(xprt, xps); + xprt_delete_locked(xprt, xps); + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); +out: + return count; +} + int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); @@ -382,39 +554,66 @@ static void rpc_sysfs_xprt_release(struct kobject *kobj) kfree(xprt); } -static const void *rpc_sysfs_client_namespace(const struct kobject *kobj) +static const struct ns_common *rpc_sysfs_client_namespace(const struct kobject *kobj) { - return container_of(kobj, struct rpc_sysfs_client, kobject)->net; + return to_ns_common(container_of(kobj, struct rpc_sysfs_client, + kobject)->net); } -static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) +static const struct ns_common *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) { - return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; + return to_ns_common(container_of(kobj, struct rpc_sysfs_xprt_switch, + kobject)->net); } -static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) +static const struct ns_common *rpc_sysfs_xprt_namespace(const struct kobject *kobj) { - return container_of(kobj, struct rpc_sysfs_xprt, - kobject)->xprt->xprt_net; + return to_ns_common(container_of(kobj, struct rpc_sysfs_xprt, + kobject)->xprt->xprt_net); } +static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version, + 0444, rpc_sysfs_clnt_version_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program, + 0444, rpc_sysfs_clnt_program_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect, + 0444, rpc_sysfs_clnt_max_connect_show, NULL); + +static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = { + &rpc_sysfs_clnt_version.attr, + &rpc_sysfs_clnt_program.attr, + &rpc_sysfs_clnt_max_connect.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt); + static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec, + 0644, rpc_sysfs_xprt_xprtsec_show, NULL); + static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); +static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt, + 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt); + static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, + &rpc_sysfs_xprt_xprtsec.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, + &rpc_sysfs_xprt_del.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); @@ -422,14 +621,20 @@ ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt = + __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show, + rpc_sysfs_xprt_switch_add_xprt_store); + static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, + &rpc_sysfs_xprt_switch_add_xprt.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, + .default_groups = rpc_sysfs_rpc_clnt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; @@ -461,7 +666,7 @@ static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, { struct rpc_sysfs_client *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; @@ -481,7 +686,7 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent, { struct rpc_sysfs_xprt_switch *p; - p = kzalloc(sizeof(*p), gfp_flags); + p = kzalloc_obj(*p, gfp_flags); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; @@ -501,7 +706,7 @@ static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, { struct rpc_sysfs_xprt *p; - p = kzalloc(sizeof(*p), gfp_flags); + p = kzalloc_obj(*p, gfp_flags); if (!p) goto out; p->kobject.kset = rpc_sunrpc_kset; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 4e003cb516fe..e83d5d0be78b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) } EXPORT_SYMBOL_GPL(xdr_encode_netobj); -__be32 * -xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) -{ - unsigned int len; - - if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) - return NULL; - obj->len = len; - obj->data = (u8 *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_netobj); - /** * xdr_encode_opaque_fixed - Encode fixed length opaque data * @p: pointer to current position in XDR buffer. @@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string) } EXPORT_SYMBOL_GPL(xdr_encode_string); -__be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, - unsigned int *lenp, unsigned int maxlen) -{ - u32 len; - - len = be32_to_cpu(*p++); - if (len > maxlen) - return NULL; - *lenp = len; - *sp = (char *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); - /** * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf * @buf: XDR buffer where string resides @@ -146,7 +118,7 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) size_t i, n = xdr_buf_pagecount(buf); if (n != 0 && buf->bvec == NULL) { - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + buf->bvec = kmalloc_objs(buf->bvec[0], n, gfp); if (!buf->bvec) return -ENOMEM; for (i = 0; i < n; i++) { @@ -213,6 +185,7 @@ bvec_overflow: pr_warn_once("%s: bio_vec array overflow\n", __func__); return count - 1; } +EXPORT_SYMBOL_GPL(xdr_buf_to_bvec); /** * xdr_inline_pages - Prepare receive buffer for a large reply @@ -992,21 +965,18 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer into which to encode data - * @pages: list of pages to decode into - * @rqst: pointer to controlling rpc_rqst, for debugging * */ -void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, - struct page **pages, struct rpc_rqst *rqst) +void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf) { xdr_reset_scratch_buffer(xdr); xdr->buf = buf; - xdr->page_ptr = pages; + xdr->page_ptr = buf->pages; xdr->iov = NULL; - xdr->p = page_address(*pages); + xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = rqst; + xdr->rqst = NULL; } EXPORT_SYMBOL_GPL(xdr_init_encode_pages); @@ -2247,88 +2217,6 @@ out: EXPORT_SYMBOL_GPL(xdr_process_buf); /** - * xdr_stream_decode_opaque - Decode variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store opaque data - * @size: size of storage buffer @ptr - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @ptr - */ -ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret <= 0) - return ret; - memcpy(ptr, p, ret); - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); - -/** - * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store pointer to opaque data - * @maxlen: maximum acceptable object size - * @gfp_flags: GFP mask to use - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE if the size of the object would exceed @maxlen - * %-ENOMEM on memory allocation failure - */ -ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, - size_t maxlen, gfp_t gfp_flags) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); - if (ret > 0) { - *ptr = kmemdup(p, ret, gfp_flags); - if (*ptr != NULL) - return ret; - ret = -ENOMEM; - } - *ptr = NULL; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); - -/** - * xdr_stream_decode_string - Decode variable length string - * @xdr: pointer to xdr_stream - * @str: location to store string - * @size: size of storage buffer @str - * - * Return values: - * On success, returns length of NUL-terminated string stored in *@str - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @str - */ -ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret > 0) { - memcpy(str, p, ret); - str[ret] = '\0'; - return strlen(str); - } - *str = '\0'; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_string); - -/** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream * @str: location to store pointer to string diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 09f245cda526..48a3618cbb29 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -854,7 +854,7 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt) static void xprt_init_autodisconnect(struct timer_list *t) { - struct rpc_xprt *xprt = from_timer(xprt, t, timer); + struct rpc_xprt *xprt = timer_container_of(xprt, t, timer); if (!RB_EMPTY_ROOT(&xprt->recv_queue)) return; @@ -1167,7 +1167,7 @@ xprt_request_enqueue_receive(struct rpc_task *task) spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); return 0; } @@ -1365,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (!req->rq_seqno) { + } else if (req->rq_seqno_count == 0) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; @@ -1663,6 +1663,22 @@ void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_add_backlog); +/** + * xprt_add_backlog_noncongested - queue task on backlog + * @xprt: transport whose backlog queue receives the task + * @task: task to queue + * + * Like xprt_add_backlog, but does not set XPRT_CONGESTED. + * For transports whose free_slot path does not synchronize + * with xprt_throttle_congested via reserve_lock. + */ +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task) +{ + rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); +} +EXPORT_SYMBOL_GPL(xprt_add_backlog_noncongested); + static bool __xprt_set_rq(struct rpc_task *task, void *data) { struct rpc_rqst *req = data; @@ -1709,7 +1725,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); - req = kzalloc(sizeof(*req), rpc_task_gfp_mask()); + req = kzalloc_obj(*req, rpc_task_gfp_mask()); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; @@ -1829,7 +1845,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { - req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); + req = kzalloc_obj(struct rpc_rqst); if (!req) goto out_free; list_add(&req->rq_list, &xprt->free); @@ -1898,6 +1914,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; + req->rq_seqno_count = 0; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); @@ -2138,7 +2155,7 @@ static void xprt_destroy(struct rpc_xprt *xprt) * can only run *before* del_time_sync(), never after. */ spin_lock(&xprt->transport_lock); - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); spin_unlock(&xprt->transport_lock); /* diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 7e98d4dd9f10..3ba818d637be 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -92,6 +92,27 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, xprt_put(xprt); } +/** + * rpc_xprt_switch_get_main_xprt - Get the 'main' xprt for an xprt switch. + * @xps: pointer to struct rpc_xprt_switch. + */ +struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps) +{ + struct rpc_xprt_iter xpi; + struct rpc_xprt *xprt; + + xprt_iter_init_listall(&xpi, xps); + + xprt = xprt_iter_get_next(&xpi); + while (xprt && !xprt->main) { + xprt_put(xprt); + xprt = xprt_iter_get_next(&xpi); + } + + xprt_iter_destroy(&xpi); + return xprt; +} + static DEFINE_IDA(rpc_xprtswitch_ids); void xprt_multipath_cleanup_ids(void) @@ -129,7 +150,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, { struct rpc_xprt_switch *xps; - xps = kmalloc(sizeof(*xps), gfp_flags); + xps = kmalloc_obj(*xps, gfp_flags); if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 8c817e755262..2f0f9618dd05 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -9,6 +9,7 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/bc_xprt.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> @@ -220,7 +221,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; struct xdr_buf *buf; @@ -261,11 +261,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ - bc_serv = xprt->bc_serv; - xprt_get(xprt); - lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list); - - svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + xprt_enqueue_bc_request(rqst); r_xprt->rx_stats.bcall_count++; return; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 31434aeb8e29..7f79a0a2601e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -244,9 +244,10 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) } ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_recv_batch = ep->re_max_requests >> 2; ep->re_attr.cap.max_recv_wr = ep->re_max_requests; ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; + ep->re_attr.cap.max_recv_wr += ep->re_recv_batch; ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ ep->re_max_rdma_segs = @@ -268,10 +269,9 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) } /** - * frwr_map - Register a memory region + * frwr_map - Register a memory region from an xdr_buf cursor * @r_xprt: controlling transport - * @seg: memory region co-ordinates - * @nsegs: number of segments remaining + * @cur: cursor tracking position within the xdr_buf * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory * @mr: MR to fill in @@ -279,34 +279,104 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * - * Returns the next segment or a negative errno pointer. - * On success, @mr is filled in. + * Returns 0 on success (cursor advanced past consumed data, + * @mr populated) or a negative errno on failure. */ -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr) +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr) { struct rpcrdma_ep *ep = r_xprt->rx_ep; + const struct xdr_buf *xdrbuf = cur->xc_buf; + bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS; + unsigned int max_depth = ep->re_max_fr_depth; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ep->re_max_fr_depth) - nsegs = ep->re_max_fr_depth; - for (i = 0; i < nsegs;) { - sg_set_page(&mr->mr_sg[i], seg->mr_page, - seg->mr_len, seg->mr_offset); - - ++seg; - ++i; - if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) - continue; - if ((i < nsegs && seg->mr_offset) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; + i = 0; + + /* Head kvec */ + if (!(cur->xc_flags & XC_HEAD_DONE)) { + const struct kvec *head = &xdrbuf->head[0]; + + sg_set_page(&mr->mr_sg[i], + virt_to_page(head->iov_base), + head->iov_len, + offset_in_page(head->iov_base)); + cur->xc_flags |= XC_HEAD_DONE; + i++; + /* Without sg-gap support, each non-contiguous region + * must be registered as a separate MR. Returning + * here after the head kvec causes the caller to + * invoke frwr_map() again for the page list and + * tail. + */ + if (!sg_gaps) + goto finish; } + + /* Page list */ + if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) { + unsigned int page_base, remaining; + struct page **ppages; + + remaining = xdrbuf->page_len - cur->xc_page_offset; + page_base = offset_in_page(xdrbuf->page_base + + cur->xc_page_offset); + ppages = xdrbuf->pages + + ((xdrbuf->page_base + cur->xc_page_offset) + >> PAGE_SHIFT); + + while (remaining > 0 && i < max_depth) { + unsigned int len; + + len = min_t(unsigned int, + PAGE_SIZE - page_base, remaining); + sg_set_page(&mr->mr_sg[i], *ppages, + len, page_base); + cur->xc_page_offset += len; + i++; + ppages++; + remaining -= len; + + if (!sg_gaps && remaining > 0 && + offset_in_page(page_base + len)) + goto finish; + page_base = 0; + } + if (remaining == 0) + cur->xc_flags |= XC_PAGES_DONE; + } else if (!(cur->xc_flags & XC_PAGES_DONE)) { + cur->xc_flags |= XC_PAGES_DONE; + } + + /* Tail kvec */ + if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len && + i < max_depth) { + const struct kvec *tail = &xdrbuf->tail[0]; + + if (!sg_gaps && i > 0) { + struct scatterlist *prev = &mr->mr_sg[i - 1]; + + if (offset_in_page(prev->offset + prev->length) || + offset_in_page(tail->iov_base)) + goto finish; + } + sg_set_page(&mr->mr_sg[i], + virt_to_page(tail->iov_base), + tail->iov_len, + offset_in_page(tail->iov_base)); + cur->xc_flags |= XC_TAIL_DONE; + i++; + } else if (!(cur->xc_flags & XC_TAIL_DONE) && + !xdrbuf->tail[0].iov_len) { + cur->xc_flags |= XC_TAIL_DONE; + } + +finish: mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; @@ -338,15 +408,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - return seg; + return 0; out_dmamap_err: trace_xprtrdma_frwr_sgerr(mr, i); - return ERR_PTR(-EIO); + return -EIO; out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - return ERR_PTR(-EIO); + return -EIO; } /** @@ -669,9 +739,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_mr_seg seg; + struct ib_reg_wr *reg_wr; struct rpcrdma_mr *mr; + struct ib_mr *ibmr; + int dma_nents; + int ret; mr = rpcrdma_mr_get(r_xprt); if (!mr) @@ -679,11 +753,39 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) mr->mr_req = NULL; ep->re_write_pad_mr = mr; - seg.mr_len = XDR_UNIT; - seg.mr_page = virt_to_page(ep->re_write_pad); - seg.mr_offset = offset_in_page(ep->re_write_pad); - if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) - return -EIO; + sg_init_table(mr->mr_sg, 1); + sg_set_page(mr->mr_sg, virt_to_page(ep->re_write_pad), + XDR_UNIT, offset_in_page(ep->re_write_pad)); + + mr->mr_dir = DMA_FROM_DEVICE; + mr->mr_nents = 1; + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, + mr->mr_nents, mr->mr_dir); + if (!dma_nents) { + ret = -EIO; + goto out_mr; + } + mr->mr_device = ep->re_id->device; + + ibmr = mr->mr_ibmr; + if (ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, + PAGE_SIZE) != dma_nents) { + ret = -EIO; + goto out_unmap; + } + + /* IOVA is not tagged with an XID; the write-pad is not RPC-specific. */ + ib_update_fast_reg_key(ibmr, ib_inc_rkey(ibmr->rkey)); + + reg_wr = &mr->mr_regwr; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; + reg_wr->access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; + trace_xprtrdma_mr_fastreg(mr); mr->mr_cqe.done = frwr_wc_fastreg; @@ -693,5 +795,16 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) mr->mr_regwr.wr.opcode = IB_WR_REG_MR; mr->mr_regwr.wr.send_flags = 0; - return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); + ret = ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); + if (!ret) + return 0; + +out_unmap: + frwr_mr_unmap(mr); +out_mr: + ep->re_write_pad_mr = NULL; + spin_lock(&buf->rb_lock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_lock); + return ret; } diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index 28c68b5f6823..de49ad02053d 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -108,7 +108,7 @@ static int rpcrdma_add_one(struct ib_device *device) { struct rpcrdma_device *rd; - rd = kzalloc(sizeof(*rd), GFP_KERNEL); + rd = kzalloc_obj(*rd); if (!rd) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1478c41c7e9d..0e0f21974710 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); while (len > 0) { if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppages = alloc_page(GFP_NOWAIT); if (!*ppages) return -ENOBUFS; ppages++; @@ -200,67 +200,30 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) return 0; } -/* Convert @vec to a single SGL element. - * - * Returns pointer to next available SGE, and bumps the total number - * of SGEs consumed. - */ -static struct rpcrdma_mr_seg * -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - unsigned int *n) +static void +rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur, + const struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type) { - seg->mr_page = virt_to_page(vec->iov_base); - seg->mr_offset = offset_in_page(vec->iov_base); - seg->mr_len = vec->iov_len; - ++seg; - ++(*n); - return seg; + cur->xc_buf = xdrbuf; + cur->xc_page_offset = 0; + cur->xc_flags = 0; + + if (pos != 0) + cur->xc_flags |= XC_HEAD_DONE; + if (!xdrbuf->page_len) + cur->xc_flags |= XC_PAGES_DONE; + if (type == rpcrdma_readch || type == rpcrdma_writech || + !xdrbuf->tail[0].iov_len) + cur->xc_flags |= XC_TAIL_DONE; } -/* Convert @xdrbuf into SGEs no larger than a page each. As they - * are registered, these SGEs are then coalesced into RDMA segments - * when the selected memreg mode supports it. - * - * Returns positive number of SGEs consumed, or a negative errno. - */ - -static int -rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, - unsigned int pos, enum rpcrdma_chunktype type, - struct rpcrdma_mr_seg *seg) +static bool +rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur) { - unsigned long page_base; - unsigned int len, n; - struct page **ppages; - - n = 0; - if (pos == 0) - seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); - - len = xdrbuf->page_len; - ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdrbuf->page_base); - while (len) { - seg->mr_page = *ppages; - seg->mr_offset = page_base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); - len -= seg->mr_len; - ++ppages; - ++seg; - ++n; - page_base = 0; - } - - if (type == rpcrdma_readch || type == rpcrdma_writech) - goto out; - - if (xdrbuf->tail[0].iov_len) - rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); - -out: - if (unlikely(n > RPCRDMA_MAX_SEGS)) - return -EIO; - return n; + return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE | + XC_TAIL_DONE)) == + (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE); } static int @@ -292,11 +255,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } -static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, - struct rpcrdma_mr **mr) +static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_xdr_cursor *cur, + bool writing, struct rpcrdma_mr **mr) { *mr = rpcrdma_mr_pop(&req->rl_free_mrs); if (!*mr) { @@ -307,13 +269,13 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, } rpcrdma_mr_push(*mr, &req->rl_registered); - return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* Register and XDR encode the Read list. Supports encoding a list of read @@ -336,10 +298,10 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; unsigned int pos; - int nsegs; + int ret; if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; @@ -347,24 +309,20 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, - rtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype); do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr); + if (ret) + return ret; if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); done: if (xdr_stream_encode_item_absent(xdr) < 0) @@ -394,20 +352,16 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_writech) goto done; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, - rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, wtype); if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -418,30 +372,30 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_write(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) return -EMSGSIZE; trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, - nsegs); + true); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += mr->mr_length; + r_xprt->rx_stats.total_rdma_request += + ep->re_write_pad_mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; } /* Update count of segments in this Write chunk */ @@ -471,9 +425,9 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_replych) { @@ -482,10 +436,7 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return 0; } - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype); if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -496,19 +447,19 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); @@ -1471,7 +1422,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); @@ -1490,15 +1440,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* LocalInv completion will complete the RPC */ else kref_put(&req->rl_kref, rpcrdma_reply_done); - return; -out_badversion: - trace_xprtrdma_reply_vers_err(rep); - goto out; +out_post: + rpcrdma_post_recvs(r_xprt, + credits + (buf->rb_bc_srv_max_requests << 1)); + return; out_norqst: spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst_err(rep); + rpcrdma_rep_put(buf, rep); + goto out_post; + +out_badversion: + trace_xprtrdma_reply_vers_err(rep); goto out; out_shortreply: diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c index b63cfeaa2923..1f8f7dad8b6f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -29,7 +29,7 @@ static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) { struct svc_rdma_chunk *chunk; - chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + chunk = kmalloc_flex(*chunk, ch_segments, segcount); if (!chunk) return NULL; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 292022f0976e..f8a0638eb095 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -118,20 +118,25 @@ svc_rdma_next_recv_ctxt(struct list_head *list) static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_recv_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; - ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node); + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages), + GFP_KERNEL, node); if (!ctxt) goto fail0; + ctxt->rc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device, addr)) goto fail2; svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); @@ -163,7 +168,7 @@ fail0: static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); kfree(ctxt->rc_recv_buf); kfree(ctxt); @@ -497,7 +502,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) * a computation, perform a simple range check. This is an * arbitrary but sensible limit (ie, not architectural). */ - if (unlikely(segcount > RPCSVC_MAXPAGES)) + if (unlikely(segcount > rctxt->rc_maxpages)) return false; p = xdr_inline_decode(&rctxt->rc_stream, @@ -857,18 +862,12 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, unsigned int i; /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing - * the rq_pages that were already allocated for this rqstp. + * the receive buffer pages already allocated for this rqstp. */ - release_pages(rqstp->rq_respages, ctxt->rc_page_count); + release_pages(rqstp->rq_pages, ctxt->rc_page_count); for (i = 0; i < ctxt->rc_page_count; i++) rqstp->rq_pages[i] = ctxt->rc_pages[i]; - /* Update @rqstp's result send buffer to start after the - * last page in the RDMA Read payload. - */ - rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Prevent svc_rdma_recv_ctxt_put() from releasing the * pages in ctxt::rc_pages a second time. */ @@ -927,10 +926,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; - /* Prevent svc_xprt_release() from releasing pages in rq_pages - * when returning 0 or an error. + /* Precaution: a zero page count on error return causes + * svc_rqst_release_pages() to release nothing. */ - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_xprt_ctxt = NULL; @@ -958,7 +956,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return 0; percpu_counter_inc(&svcrdma_stat_recv); - ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ib_dma_sync_single_for_cpu(rdma_xprt->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 40797114d50a..402e2ceca4ff 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -5,6 +5,8 @@ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. */ +#include <linux/bvec.h> +#include <linux/overflow.h> #include <rdma/rw.h> #include <linux/sunrpc/xdr.h> @@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. * - * Each WR chain handles a single contiguous server-side buffer, - * because scatterlist entries after the first have to start on - * page alignment. xdr_buf iovecs cannot guarantee alignment. + * Each WR chain handles a single contiguous server-side buffer. + * - each xdr_buf iovec is a single contiguous buffer + * - the xdr_buf pages array is a single contiguous buffer because the + * second through the last element always start on a page boundary * * Each WR chain handles only one R_key. Each RPC-over-RDMA segment * from a client may contain a unique R_key, so each WR chain moves * up to one segment at a time. * - * The scatterlist makes this data structure over 4KB in size. To - * make it less likely to fail, and to handle the allocation for - * smaller I/O requests without disabling bottom-halves, these - * contexts are created on demand, but cached and reused until the - * controlling svcxprt_rdma is destroyed. + * The inline bvec array is sized to handle most I/O requests without + * additional allocation. Larger requests fall back to dynamic allocation. + * These contexts are created on demand, but cached and reused until + * the controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; - unsigned int rw_first_sgl_nents; - struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[]; + unsigned int rw_first_bvec_nents; + struct bio_vec *rw_bvec; + struct bio_vec rw_first_bvec[]; }; +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt); + static inline struct svc_rdma_rw_ctxt * svc_rdma_next_ctxt(struct list_head *list) { @@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list) } static struct svc_rdma_rw_ctxt * -svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) { struct ib_device *dev = rdma->sc_cm_id->device; - unsigned int first_sgl_nents = dev->attrs.max_send_sge; + unsigned int first_bvec_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, + first_bvec_nents), GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); - ctxt->rw_first_sgl_nents = first_sgl_nents; + ctxt->rw_first_bvec_nents = first_bvec_nents; } - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, - ctxt->rw_sg_table.sgl, - first_sgl_nents)) - goto out_free; + if (nr_bvec <= ctxt->rw_first_bvec_nents) { + ctxt->rw_bvec = ctxt->rw_first_bvec; + } else { + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, + sizeof(*ctxt->rw_bvec), + GFP_KERNEL, + ibdev_to_node(dev)); + if (!ctxt->rw_bvec) + goto out_free; + } return ctxt; out_free: - kfree(ctxt); + /* Return cached contexts to cache; free freshly allocated ones */ + if (node) + svc_rdma_put_rw_ctxt(rdma, ctxt); + else + kfree(ctxt); out_noctx: - trace_svcrdma_rwctx_empty(rdma, sges); + trace_svcrdma_rwctx_empty(rdma, nr_bvec); return NULL; } static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); llist_add(&ctxt->rw_node, list); } @@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * @ctxt: R/W context to prepare * @offset: RDMA offset * @handle: RDMA tag/handle + * @length: total number of bytes in the bvec array * @direction: I/O direction * * Returns on success, the number of WQEs that will be needed @@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) */ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt, - u64 offset, u32 handle, + u64 offset, u32 handle, unsigned int length, enum dma_data_direction direction) { + struct bvec_iter iter = { + .bi_size = length, + }; int ret; - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, handle, direction); + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, + iter, offset, handle, direction); if (unlikely(ret < 0)) { trace_svcrdma_dma_map_rw_err(rdma, offset, handle, ctxt->rw_nents, ret); @@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, { struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; - LLIST_HEAD(free); trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); @@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(ctxt, &free); + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, dir); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -231,6 +252,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) } /** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + * + * Write chunk resources remain live until Send completion because + * Write WRs are chained to the Send WR. This function releases all + * write_info structures accumulated on @ctxt->sc_write_info_list. + */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + svc_rdma_write_info_free(info); + } +} + +/** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport * @ctxt: Send context that is being released @@ -286,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -300,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. + */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -384,59 +424,39 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, cqe = NULL; } - do { - if (atomic_sub_return(cc->cc_sqecount, - &rdma->sc_sq_avail) > 0) { - cc->cc_posttime = ktime_get(); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) - break; - return 0; - } - - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma, &cc->cc_cid); - } while (1); - - trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one was posted, there will be a completion. */ - if (bad_wr != first_wr) - return 0; + ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount); + if (ret < 0) + return ret; - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - return -ENOTCONN; + cc->cc_posttime = ktime_get(); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr, + first_wr, cc->cc_sqecount, + ret); + return 0; } -/* Build and DMA-map an SGL that covers one kvec in an xdr_buf +/* Build a bvec that covers one kvec in an xdr_buf. */ -static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, - unsigned int len, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) { - struct scatterlist *sg = ctxt->rw_sg_table.sgl; - - sg_set_buf(&sg[0], info->wi_base, len); + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); info->wi_base += len; ctxt->rw_nents = 1; } -/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. +/* Build a bvec array that covers part of an xdr_buf's pagelist. */ -static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, - unsigned int remaining, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) { - unsigned int sge_no, sge_bytes, page_off, page_no; + unsigned int bvec_idx, bvec_len, page_off, page_no; const struct xdr_buf *xdr = info->wi_xdr; - struct scatterlist *sg; struct page **page; page_off = info->wi_next_off + xdr->page_base; @@ -444,21 +464,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; - sg = ctxt->rw_sg_table.sgl; - sge_no = 0; + bvec_idx = 0; do { - sge_bytes = min_t(unsigned int, remaining, - PAGE_SIZE - page_off); - sg_set_page(sg, *page, sge_bytes, page_off); - - remaining -= sge_bytes; - sg = sg_next(sg); + bvec_len = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, + page_off); + remaining -= bvec_len; page_off = 0; - sge_no++; + bvec_idx++; page++; } while (remaining); - ctxt->rw_nents = sge_no; + ctxt->rw_nents = bvec_idx; } /* Construct RDMA Write WRs to send a portion of an xdr_buf containing @@ -496,7 +514,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, constructor(info, write_len, ctxt); offset = seg->rs_offset + info->wi_seg_off; ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, - DMA_TO_DEVICE); + write_len, DMA_TO_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_write); @@ -535,7 +553,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info, const struct kvec *iov) { info->wi_base = iov->iov_base; - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, iov->iov_len); } @@ -559,7 +577,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, { info->wi_xdr = xdr; info->wi_next_off = offset - xdr->head[0].iov_len; - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, length); } @@ -601,9 +619,37 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +/* Link chunk WRs onto @sctxt's WR chain. Completion is requested + * for the tail WR, which is posted first. + */ +static void svc_rdma_cc_link_wrs(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_chunk_ctxt *cc) +{ + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; +} + +/* Link Write WRs for @chunk onto @sctxt's WR chain. + */ +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; @@ -623,10 +669,14 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; + + svc_rdma_cc_link_wrs(rdma, sctxt, cc); + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); return 0; out_err: @@ -635,17 +685,19 @@ out_err: } /** - * svc_rdma_send_write_list - Send all chunks on the Write list + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport * @rctxt: Write list provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply message * - * Returns zero on success, or a negative errno if one or more - * Write chunks could not be sent. + * Returns zero on success, or a negative errno if WR chain + * construction fails for one or more Write chunks. */ -int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; @@ -653,7 +705,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); if (ret < 0) return ret; } @@ -683,9 +735,6 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info = &sctxt->sc_reply_info; struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct ib_send_wr *first_wr; - struct list_head *pos; - struct ib_cqe *cqe; int ret; info->wi_rdma = rdma; @@ -699,23 +748,222 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, if (ret < 0) return ret; - first_wr = sctxt->sc_wr_chain; - cqe = &cc->cc_cqe; - list_for_each(pos, &cc->cc_rwctxts) { - struct svc_rdma_rw_ctxt *rwc; - - rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); - first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, cqe, first_wr); - cqe = NULL; - } - sctxt->sc_wr_chain = first_wr; - sctxt->sc_sqecount += cc->cc_sqecount; + svc_rdma_cc_link_wrs(rdma, sctxt, cc); trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); return xdr->len; } +/* + * Cap contiguous RDMA Read sink allocations at order-4. + * Higher orders risk allocation failure under + * __GFP_NORETRY, which would negate the benefit of the + * contiguous fast path. + */ +#define SVC_RDMA_CONTIG_MAX_ORDER 4 + +/** + * svc_rdma_alloc_read_pages - Allocate physically contiguous pages + * @nr_pages: number of pages needed + * @order: on success, set to the allocation order + * + * Attempts a higher-order allocation, falling back to smaller orders. + * The returned pages are split immediately so each sub-page has its + * own refcount and can be freed independently. + * + * Returns a pointer to the first page on success, or NULL if even + * order-1 allocation fails. + */ +static struct page * +svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order) +{ + unsigned int o; + struct page *page; + + o = min(get_order(nr_pages << PAGE_SHIFT), + SVC_RDMA_CONTIG_MAX_ORDER); + + while (o >= 1) { + page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, + o); + if (page) { + split_page(page, o); + *order = o; + return page; + } + o--; + } + return NULL; +} + +/* + * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @bv: bvec entry to fill + * @pages_left: number of data pages remaining in the segment + * @len_left: bytes remaining in the segment + * + * On success, fills @bv with a bvec spanning the contiguous range and + * advances rc_curpage/rc_page_count. Returns the byte length covered, + * or zero if the allocation failed or would overrun rq_maxpages. + */ +static unsigned int +svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + struct bio_vec *bv, unsigned int pages_left, + unsigned int len_left) +{ + unsigned int order, npages, chunk_pages, chunk_len, i; + struct page *page; + + page = svc_rdma_alloc_read_pages(pages_left, &order); + if (!page) + return 0; + npages = 1 << order; + + if (head->rc_curpage + npages > rqstp->rq_maxpages) { + for (i = 0; i < npages; i++) + __free_page(page + i); + return 0; + } + + /* + * Replace rq_pages[] entries with pages from the contiguous + * allocation. If npages exceeds chunk_pages, the extra pages + * stay in rq_pages[] for later reuse or normal rqst teardown. + */ + for (i = 0; i < npages; i++) { + svc_rqst_page_release(rqstp, + rqstp->rq_pages[head->rc_curpage + i]); + rqstp->rq_pages[head->rc_curpage + i] = page + i; + } + + chunk_pages = min(npages, pages_left); + chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left); + bvec_set_page(bv, page, chunk_len, 0); + head->rc_page_count += chunk_pages; + head->rc_curpage += chunk_pages; + return chunk_len; +} + +/* + * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array + * @head: context for ongoing I/O + * @ctxt: R/W context whose bvec array is being filled + * @cur: page to add + * @bvec_idx: pointer to current bvec index, not advanced on merge + * @len_left: bytes remaining in the segment + * + * If @cur is physically contiguous with the preceding bvec, it is + * merged by extending that bvec's length. Otherwise a new bvec + * entry is created. Returns the byte length covered. + */ +static unsigned int +svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head, + struct svc_rdma_rw_ctxt *ctxt, struct page *cur, + unsigned int *bvec_idx, unsigned int len_left) +{ + unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left); + + head->rc_page_count++; + head->rc_curpage++; + + if (*bvec_idx > 0) { + struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1]; + + if (page_to_phys(prev->bv_page) + prev->bv_offset + + prev->bv_len == page_to_phys(cur)) { + prev->bv_len += chunk_len; + return chunk_len; + } + } + + bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0); + (*bvec_idx)++; + return chunk_len; +} + +/** + * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Greedily allocates higher-order pages to cover the segment, + * building one bvec per contiguous chunk. Each allocation is + * split so sub-pages have independent refcounts. When a + * higher-order allocation fails, remaining pages are covered + * individually, merging adjacent pages into the preceding bvec + * when they are physically contiguous. The split sub-pages + * replace entries in rq_pages[] so downstream cleanup is + * unchanged. + * + * Returns: + * %0: the Read WR was constructed successfully + * %-ENOMEM: allocation failed + * %-EIO: a DMA mapping error occurred + */ +static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + const struct svc_rdma_segment *segment) +{ + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; + unsigned int nr_data_pages, bvec_idx; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int len_left; + int ret; + + nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT; + if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages) + return -ENOMEM; + + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages); + if (!ctxt) + return -ENOMEM; + + bvec_idx = 0; + len_left = segment->rs_length; + while (len_left) { + unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT; + unsigned int chunk_len = 0; + + if (pages_left >= 2) + chunk_len = svc_rdma_fill_contig_bvec(rqstp, head, + &ctxt->rw_bvec[bvec_idx], + pages_left, len_left); + if (chunk_len) { + bvec_idx++; + } else { + struct page *cur = + rqstp->rq_pages[head->rc_curpage]; + chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur, + &bvec_idx, + len_left); + } + + len_left -= chunk_len; + } + + ctxt->rw_nents = bvec_idx; + + head->rc_pageoff = offset_in_page(segment->rs_length); + if (head->rc_pageoff) + head->rc_curpage--; + + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); + if (ret < 0) + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; +} + /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment * @rqstp: RPC transaction context @@ -734,29 +982,37 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, { struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - unsigned int sge_no, seg_len, len; + unsigned int bvec_idx, nr_bvec, seg_len, len, total; struct svc_rdma_rw_ctxt *ctxt; - struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); + if (check_add_overflow(head->rc_pageoff, len, &total)) + return -EINVAL; + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + + if (head->rc_pageoff == 0 && nr_bvec >= 2) { + ret = svc_rdma_build_read_segment_contig(rqstp, head, + segment); + if (ret != -ENOMEM) + return ret; + } + + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; - ctxt->rw_nents = sge_no; + ctxt->rw_nents = nr_bvec; - sg = ctxt->rw_sg_table.sgl; - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { seg_len = min_t(unsigned int, len, PAGE_SIZE - head->rc_pageoff); if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], - seg_len, head->rc_pageoff); - sg = sg_next(sg); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], + rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); head->rc_pageoff += seg_len; if (head->rc_pageoff == PAGE_SIZE) { @@ -765,12 +1021,13 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } len -= seg_len; - if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages))) + if (len && ((head->rc_curpage + 1) > rqstp->rq_maxpages)) goto out_overrun; } ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, - segment->rs_handle, DMA_FROM_DEVICE); + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_read); @@ -841,6 +1098,9 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, for (page_no = 0; page_no < numpages; page_no++) { unsigned int page_len; + if (head->rc_curpage >= rqstp->rq_maxpages) + return -EINVAL; + page_len = min_t(unsigned int, remaining, PAGE_SIZE - head->rc_pageoff); @@ -848,7 +1108,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, head->rc_page_count++; dst = page_address(rqstp->rq_pages[head->rc_curpage]); - memcpy(dst + head->rc_curpage, src + offset, page_len); + memcpy((unsigned char *)dst + head->rc_pageoff, src + offset, page_len); head->rc_readbytes += page_len; head->rc_pageoff += page_len; @@ -860,7 +1120,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, offset += page_len; } - return -EINVAL; + return 0; } /** @@ -1083,10 +1343,16 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, { unsigned int i; + /* + * Move only pages containing RPC data into rc_pages[]. Pages + * from a contiguous allocation that were not used for the + * payload remain in rq_pages[] for subsequent reuse. + */ for (i = 0; i < head->rc_page_count; i++) { head->rc_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = head->rc_page_count; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 96154a2367a1..8b3f0c8c14b2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -116,8 +116,10 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_send_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; int i; @@ -126,13 +128,19 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) GFP_KERNEL, node); if (!ctxt) goto fail0; + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt->sc_pages = kcalloc_node(pages, sizeof(struct page *), + GFP_KERNEL, node); + if (!ctxt->sc_pages) + goto fail1; + ctxt->sc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) - goto fail1; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, addr)) + goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); @@ -142,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -151,8 +160,10 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; -fail2: +fail3: kfree(buffer); +fail2: + kfree(ctxt->sc_pages); fail1: kfree(ctxt); fail0: @@ -166,16 +177,16 @@ fail0: */ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { + struct ib_device *device = rdma->sc_cm_id->device; struct svc_rdma_send_ctxt *ctxt; struct llist_node *node; while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - ib_dma_unmap_single(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - rdma->sc_max_req_size, - DMA_TO_DEVICE); + ib_dma_unmap_single(device, ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); + kfree(ctxt->sc_pages); kfree(ctxt); } } @@ -227,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -285,6 +297,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) } /** + * svc_rdma_sq_wait - Wait for SQ slots using fair queuing + * @rdma: controlling transport + * @cid: completion ID for tracing + * @sqecount: number of SQ entries needed + * + * A ticket-based system ensures fair ordering when multiple threads + * wait for Send Queue capacity. Each waiter takes a ticket and is + * served in order, preventing starvation. + * + * Protocol invariant: every ticket holder must increment + * sc_sq_ticket_tail exactly once, whether the reservation + * succeeds or the connection closes. Failing to advance the + * tail stalls all subsequent waiters. + * + * The ticket counters are signed 32-bit atomics. After + * wrapping through INT_MAX, the equality check + * (tail == ticket) remains correct because both counters + * advance monotonically and the comparison uses exact + * equality rather than relational operators. + * + * Return values: + * %0: SQ slots were reserved successfully + * %-ENOTCONN: The connection was lost + */ +int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount) +{ + int ticket; + + /* Fast path: try to reserve SQ slots without waiting. + * + * A failed reservation temporarily understates sc_sq_avail + * until the compensating atomic_add restores it. A Send + * completion arriving in that window sees a lower count + * than reality, but the value self-corrects once the add + * completes. No ordering guarantee is needed here because + * the slow path serializes all contended waiters. + */ + if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0)) + return 0; + atomic_add(sqecount, &rdma->sc_sq_avail); + + /* Slow path: take a ticket and wait in line */ + ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head); + + percpu_counter_inc(&svcrdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma, cid); + + /* Wait until all earlier tickets have been served */ + wait_event(rdma->sc_sq_ticket_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_ticket_tail) == ticket); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + + /* It's our turn. Wait for enough SQ slots to be available. */ + while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + atomic_add(sqecount, &rdma->sc_sq_avail); + + wait_event(rdma->sc_send_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_avail) >= sqecount); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + } + + /* Slots reserved successfully. Let the next waiter proceed. */ + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + trace_svcrdma_sq_retry(rdma, cid); + return 0; + +out_close: + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + return -ENOTCONN; +} + +/** + * svc_rdma_post_send_err - Handle ib_post_send failure + * @rdma: controlling transport + * @cid: completion ID for tracing + * @bad_wr: first WR that was not posted + * @first_wr: first WR in the chain + * @sqecount: number of SQ entries that were reserved + * @ret: error code from ib_post_send + * + * Return values: + * %0: At least one WR was posted; a completion handles cleanup + * %-ENOTCONN: No WRs were posted; SQ slots are released + */ +int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret) +{ + trace_svcrdma_sq_post_err(rdma, cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, a Send completion will + * return the reserved SQ slots. + */ + if (bad_wr != first_wr) + return 0; + + svc_rdma_wake_send_waiters(rdma, sqecount); + return -ENOTCONN; +} + +/** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: Completion Queue context * @wc: Work Completion object @@ -326,11 +449,6 @@ flushed: * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * - * Note there is potential for starvation when the Send Queue is - * full because there is no order to when waiting threads are - * awoken. The transport is typically provisioned with a deep - * enough Send Queue that SQ exhaustion should be a rare event. - * * Return values: * %0: @ctxt's WR chain was posted successfully * %-ENOTCONN: The connection was lost @@ -347,47 +465,21 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma, might_sleep(); /* Sync the transport header buffer */ - ib_dma_sync_single_for_device(rdma->sc_pd->device, + ib_dma_sync_single_for_device(rdma->sc_cm_id->device, send_wr->sg_list[0].addr, send_wr->sg_list[0].length, DMA_TO_DEVICE); - /* If the SQ is full, wait until an SQ entry is available */ - while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { - if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { - svc_rdma_wake_send_waiters(rdma, sqecount); - - /* When the transport is torn down, assume - * ib_drain_sq() will trigger enough Send - * completions to wake us. The XPT_CLOSE test - * above should then cause the while loop to - * exit. - */ - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cid); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 0); - trace_svcrdma_sq_retry(rdma, &cid); - continue; - } - - trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) { - trace_svcrdma_sq_post_err(rdma, &cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one WR was posted, there will be a - * Send completion that bumps sc_sq_avail. - */ - if (bad_wr == first_wr) { - svc_rdma_wake_send_waiters(rdma, sqecount); - break; - } - } - return 0; - } - return -ENOTCONN; + ret = svc_rdma_sq_wait(rdma, &cid, sqecount); + if (ret < 0) + return ret; + + trace_svcrdma_post_send(ctxt); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cid, bad_wr, + first_wr, sqecount, ret); + return 0; } /** @@ -848,7 +940,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* The svc_rqst and all resources it owns are released as soon as * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt - * so they are released by the Send completion handler. + * so they are released only after Send completion, and not by + * svc_rqst_release_pages(). */ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, struct svc_rdma_send_ctxt *ctxt) @@ -860,9 +953,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } - - /* Prevent svc_xprt_release from releasing pages in rq_pages */ - rqstp->rq_next_page = rqstp->rq_respages; } /* Prepare the portion of the RPC Reply that will be transmitted @@ -966,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* Ensure only the error message is posted, not any previously + * prepared Write chunk WRs. + */ + sctxt->sc_wr_chain = &sctxt->sc_send_wr; + sctxt->sc_sqecount = 1; if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -1013,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res); if (ret < 0) goto put_ctxt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c3fbf0779d4a..f18bc60d9f4f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); + init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); @@ -406,15 +407,14 @@ static void svc_rdma_xprt_done(struct rpcrdma_notification *rn) */ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) { + unsigned int ctxts, rq_depth, maxpayload; struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; - unsigned int ctxts, rq_depth; struct ib_device *dev; int ret = 0; - RPC_IFDEBUG(struct sockaddr *sap); listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -462,16 +462,24 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrarily estimate the number of rw_ctxs needed for - * this transport. This is enough rw_ctxs to make forward - * progress even if the client is using one rkey per page - * in each Read chunk. + /* Estimate the needed number of rdma_rw contexts. The maximum + * Read and Write chunks have one segment each. Each request + * can involve one Read chunk and either a Write chunk or Reply + * chunk; thus a factor of three. */ - ctxts = 3 * RPCSVC_MAXPAGES; - newxprt->sc_sq_depth = rq_depth + ctxts; + maxpayload = min(xprt->xpt_server->sv_max_payload, + RPCSVC_MAXPAYLOAD_RDMA); + ctxts = newxprt->sc_max_requests * 3 * + rdma_rw_mr_factor(dev, newxprt->sc_port_num, + maxpayload >> PAGE_SHIFT); + + newxprt->sc_sq_depth = rq_depth + + rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0); if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + atomic_set(&newxprt->sc_sq_ticket_head, 0); + atomic_set(&newxprt->sc_sq_ticket_tail, 0); newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { @@ -554,18 +562,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); - dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); - dprintk(" rdma_rw_ctxs : %d\n", ctxts); - dprintk(" max_requests : %d\n", newxprt->sc_max_requests); - dprintk(" ord : %d\n", conn_param.initiator_depth); -#endif + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + struct sockaddr *sap; + + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", conn_param.initiator_depth); + } return &newxprt->sc_xprt; @@ -575,6 +585,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); rdma_destroy_id(newxprt->sc_cm_id); + rpcrdma_rn_unregister(dev, &newxprt->sc_rn); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; @@ -588,12 +599,18 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_disconnect(rdma->sc_cm_id); } -static void __svc_rdma_free(struct work_struct *work) +/** + * svc_rdma_free - Release class-specific transport resources + * @xprt: Generic svc transport object + */ +static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = - container_of(work, struct svcxprt_rdma, sc_work); + container_of(xprt, struct svcxprt_rdma, sc_xprt); struct ib_device *device = rdma->sc_cm_id->device; + might_sleep(); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); @@ -621,19 +638,11 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); - rpcrdma_rn_unregister(device, &rdma->sc_rn); + if (!test_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags)) + rpcrdma_rn_unregister(device, &rdma->sc_rn); kfree(rdma); } -static void svc_rdma_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); -} - static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = @@ -643,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) * If there are already waiters on the SQ, * return false. */ - if (waitqueue_active(&rdma->sc_send_wait)) + if (waitqueue_active(&rdma->sc_send_wait) || + waitqueue_active(&rdma->sc_sq_ticket_wait)) return 0; /* Otherwise return true. */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9a8ce5df83ca..61706df5e485 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -510,8 +510,21 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: - task->tk_status = -ENOMEM; - xprt_add_backlog(xprt, task); + task->tk_status = -EAGAIN; + xprt_add_backlog_noncongested(xprt, task); + /* A buffer freed between buffer_get and rpc_sleep_on + * goes back to the pool with no waiter to wake. + * Re-check after joining the backlog to close that gap. + */ + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (req) { + struct rpc_rqst *rqst = &req->rl_slot; + + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, req); + } + } } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 63262ef0c2e3..aecf9c0a153f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -383,7 +383,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep; int rc; - ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); + ep = kzalloc_obj(*ep, XPRTRDMA_GFP_FLAGS); if (!ep) return -ENOTCONN; ep->re_xprt = &r_xprt->rx_xprt; @@ -615,8 +615,8 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), - XPRTRDMA_GFP_FLAGS); + sc = kzalloc_flex(*sc, sc_sges, ep->re_attr.cap.max_send_sge, + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; @@ -639,7 +639,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * Sends are posted. */ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); + buf->rb_sc_ctxs = kzalloc_objs(sc, i, XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; @@ -708,6 +708,18 @@ out_emptyq: */ xprt_wait_for_buffer_space(&r_xprt->rx_xprt); r_xprt->rx_stats.empty_sendctx_q++; + + /* Recheck: a Send completion between the ring-empty test + * and the set_bit could cause its xprt_write_space() to + * miss, leaving XPRT_WRITE_SPACE set with a non-full ring. + * The smp_mb__after_atomic() pairs with smp_store_release() + * in rpcrdma_sendctx_put_locked(). + */ + smp_mb__after_atomic(); + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + if (next_head != READ_ONCE(buf->rb_sc_tail)) + xprt_write_space(&r_xprt->rx_xprt); + return NULL; } @@ -739,7 +751,10 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, } while (buf->rb_sc_ctxs[next_tail] != sc); - /* Paired with READ_ONCE */ + /* Paired with READ_ONCE in rpcrdma_sendctx_get_locked(): + * both the fast-path ring-full test and the post-set_bit + * recheck in the slow path depend on this store-release. + */ smp_store_release(&buf->rb_sc_tail, next_tail); xprt_write_space(&r_xprt->rx_xprt); @@ -822,7 +837,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); + req = kzalloc_obj(*req, XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; @@ -952,7 +967,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) struct ib_device *device = ep->re_id->device; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); + rep = kzalloc_obj(*rep, XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; @@ -1359,10 +1374,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; - needed += RPCRDMA_MAX_RECV_BATCH; + needed += ep->re_recv_batch; if (atomic_inc_return(&ep->re_receiving) > 1) - goto out; + goto out_dec; /* fast path: all needed reps can be found on the free list */ wr = NULL; @@ -1385,7 +1400,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) ++count; } if (!wr) - goto out; + goto out_dec; rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1400,9 +1415,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) --count; } } + +out_dec: if (atomic_dec_return(&ep->re_receiving) > 0) complete(&ep->re_done); - out: trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8147d2b41494..f53a77472724 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -96,6 +96,7 @@ struct rpcrdma_ep { struct rpcrdma_notification re_rn; int re_receive_count; unsigned int re_max_requests; /* depends on device */ + unsigned int re_recv_batch; unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ @@ -283,19 +284,36 @@ struct rpcrdma_mr { * registered or invalidated. Must handle a Reply chunk: */ enum { - RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_IOV_SEGS = 3, /* head, page-boundary, tail */ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + RPCRDMA_MAX_IOV_SEGS, }; -/* Arguments for DMA mapping and registration */ -struct rpcrdma_mr_seg { - u32 mr_len; /* length of segment */ - struct page *mr_page; /* underlying struct page */ - u64 mr_offset; /* IN: page offset, OUT: iova */ +/** + * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf + * for iterative MR registration + * @xc_buf: the xdr_buf being iterated + * @xc_page_offset: byte offset into the page region consumed so far + * @xc_flags: combination of XC_* bits + * + * Each XC_*_DONE flag indicates that this region has no + * remaining MR registration work. That condition holds both when the region + * has already been registered by a prior frwr_map() call and + * when the region is excluded from this chunk type (pre-set + * at init time by rpcrdma_xdr_cursor_init()). frwr_map() + * treats the two cases identically: skip the region. + */ +struct rpcrdma_xdr_cursor { + const struct xdr_buf *xc_buf; + unsigned int xc_page_offset; + unsigned int xc_flags; }; +#define XC_HEAD_DONE BIT(0) +#define XC_PAGES_DONE BIT(1) +#define XC_TAIL_DONE BIT(2) + /* The Send SGE array is provisioned to send a maximum size * inline request: * - RPC-over-RDMA header @@ -330,7 +348,6 @@ struct rpcrdma_req { struct list_head rl_free_mrs; struct list_head rl_registered; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -450,8 +467,8 @@ rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) } /* Setting this to 0 ensures interoperability with early servers. - * Setting this to 1 enhances certain unaligned read/write performance. - * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ + * Setting this to 1 enhances unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c */ extern int xprt_rdma_pad_optimize; /* This setting controls the hunt for a supported memory @@ -535,10 +552,10 @@ void frwr_reset(struct rpcrdma_req *req); int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_mr_release(struct rpcrdma_mr *mr); -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr); +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 83cc095846d3..2e1fe6013361 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, - struct cmsghdr *cmsg, int ret) + unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; @@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, * record, even though there might be more frames * waiting to be decrypted. */ - msg->msg_flags &= ~MSG_EOR; + *msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); @@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags) +xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; int ret; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(sock, msg, flags); - if (msg->msg_controllen != sizeof(u)) - ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, flags); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); + ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, + -EAGAIN); + } return ret; } @@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); - ret = xs_sock_recv_cmsg(sock, msg, flags); + ret = sock_recvmsg(sock, msg, flags); + /* Handle TLS inband control message lazily */ + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); + } return ret > 0 ? ret + seek : ret; } @@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); - return xs_sock_recv_cmsg(sock, msg, flags); + return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE @@ -1825,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); - err = kernel_bind(sock, (struct sockaddr *)&myaddr, - transport->xprt.addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr, + transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; @@ -1985,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_stream_start_connect(transport); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0); } /** @@ -2385,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), + xprt->addrlen, O_NONBLOCK); } /** @@ -2726,20 +2747,14 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) if (status) goto out_close; xprt_release_write(lower_xprt, NULL); - trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); - if (!xprt_test_and_set_connected(upper_xprt)) { - upper_xprt->connect_cookie++; - clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); - xprt_clear_connecting(upper_xprt); - - upper_xprt->stat.connect_count++; - upper_xprt->stat.connect_time += (long)jiffies - - upper_xprt->stat.connect_start; - xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); - } rpc_shutdown_client(lower_clnt); + /* Check for ingress data that arrived before the socket's + * ->data_ready callback was set up. + */ + xs_poll_check_readable(upper_transport); + out_unlock: current_restore_flags(pflags, PF_MEMALLOC); upper_transport->clnt = NULL; |
