diff options
Diffstat (limited to 'net/sunrpc')
37 files changed, 1308 insertions, 877 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index a7e42f9a405c..2bff63a73cf8 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, *entry, *new; unsigned int nr; - nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits); + nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 168219535a34..f1df9837f1ac 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task, return auth->au_ops->lookup_cred(auth, acred, lookupflags); } +static int +generic_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kgid(&init_user_ns, acred->gid) | + ((u64)from_kuid(&init_user_ns, acred->uid) << + (sizeof(gid_t) * 8)), hashbits); +} + /* * Lookup generic creds for current process */ @@ -176,8 +184,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) goto out_nomatch; for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(GROUP_AT(gcred->acred.group_info, i), - GROUP_AT(acred->group_info, i))) + if (!gid_eq(gcred->acred.group_info->gid[i], + acred->group_info->gid[i])) goto out_nomatch; } out_match: @@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, .au_name = "Generic", + .hash_cred = generic_hash_cred, .lookup_cred = generic_lookup_cred, .crcreate = generic_create_cred, .key_timeout = generic_key_timeout, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 976c7812bbd5..cdeb1d814833 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -50,7 +50,7 @@ #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/gss_api.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/hashtable.h> #include "../netns.h" @@ -541,9 +541,13 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) return gss_new; gss_msg = gss_add_msg(gss_new); if (gss_msg == gss_new) { - int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); + int res; + atomic_inc(&gss_msg->count); + res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); + atomic_dec(&gss_msg->count); + gss_release_msg(gss_new); gss_msg = ERR_PTR(res); } } else @@ -836,6 +840,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) warn_gssd(); gss_release_msg(gss_msg); } + gss_release_msg(gss_msg); } static void gss_pipe_dentry_destroy(struct dentry *dir, @@ -1298,6 +1303,12 @@ gss_destroy_cred(struct rpc_cred *cred) gss_destroy_nullcred(cred); } +static int +gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits); +} + /* * Lookup RPCSEC_GSS cred for the current process */ @@ -1610,7 +1621,7 @@ gss_validate(struct rpc_task *task, __be32 *p) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); - __be32 seq; + __be32 *seq = NULL; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; @@ -1625,9 +1636,12 @@ gss_validate(struct rpc_task *task, __be32 *p) goto out_bad; if (flav != RPC_AUTH_GSS) goto out_bad; - seq = htonl(task->tk_rqstp->rq_seqno); - iov.iov_base = &seq; - iov.iov_len = sizeof(seq); + seq = kmalloc(4, GFP_NOFS); + if (!seq) + goto out_bad; + *seq = htonl(task->tk_rqstp->rq_seqno); + iov.iov_base = seq; + iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; @@ -1647,11 +1661,13 @@ gss_validate(struct rpc_task *task, __be32 *p) gss_put_ctx(ctx); dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n", task->tk_pid, __func__); + kfree(seq); return p + XDR_QUADLEN(len); out_bad: gss_put_ctx(ctx); dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__, PTR_ERR(ret)); + kfree(seq); return ret; } @@ -1982,6 +1998,7 @@ static const struct rpc_authops authgss_ops = { .au_name = "RPCSEC_GSS", .create = gss_create, .destroy = gss_destroy, + .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, .list_pseudoflavors = gss_mech_list_pseudoflavors, diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 244245bcbbd2..fb39284ec174 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -166,8 +166,8 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, unsigned int usage, struct xdr_netobj *cksumout) { struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; u8 rc4salt[4]; struct crypto_ahash *md5; struct crypto_ahash *hmac_md5; @@ -187,23 +187,22 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (!checksumdata) + return GSS_S_FAILURE; + md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(md5)) - return GSS_S_FAILURE; + goto out_free_cksum; hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hmac_md5)) { - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + if (IS_ERR(hmac_md5)) + goto out_free_md5; - req = ahash_request_alloc(md5, GFP_KERNEL); - if (!req) { - crypto_free_ahash(hmac_md5); - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + req = ahash_request_alloc(md5, GFP_NOFS); + if (!req) + goto out_free_hmac_md5; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -231,12 +230,9 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, goto out; ahash_request_free(req); - req = ahash_request_alloc(hmac_md5, GFP_KERNEL); - if (!req) { - crypto_free_ahash(hmac_md5); - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + req = ahash_request_alloc(hmac_md5, GFP_NOFS); + if (!req) + goto out_free_hmac_md5; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -258,8 +254,12 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, cksumout->len = kctx->gk5e->cksumlength; out: ahash_request_free(req); - crypto_free_ahash(md5); +out_free_hmac_md5: crypto_free_ahash(hmac_md5); +out_free_md5: + crypto_free_ahash(md5); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -276,8 +276,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, struct crypto_ahash *tfm; struct ahash_request *req; struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; unsigned int checksumlen; if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) @@ -291,15 +291,17 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (checksumdata == NULL) + return GSS_S_FAILURE; + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) - return GSS_S_FAILURE; + goto out_free_cksum; - req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) { - crypto_free_ahash(tfm); - return GSS_S_FAILURE; - } + req = ahash_request_alloc(tfm, GFP_NOFS); + if (!req) + goto out_free_ahash; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -349,7 +351,10 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, cksumout->len = kctx->gk5e->cksumlength; out: ahash_request_free(req); +out_free_ahash: crypto_free_ahash(tfm); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -368,8 +373,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, struct crypto_ahash *tfm; struct ahash_request *req; struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; unsigned int checksumlen; if (kctx->gk5e->keyed_cksum == 0) { @@ -383,16 +388,18 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (!checksumdata) + return GSS_S_FAILURE; + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) - return GSS_S_FAILURE; + goto out_free_cksum; checksumlen = crypto_ahash_digestsize(tfm); - req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) { - crypto_free_ahash(tfm); - return GSS_S_FAILURE; - } + req = ahash_request_alloc(tfm, GFP_NOFS); + if (!req) + goto out_free_ahash; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -433,7 +440,10 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, } out: ahash_request_free(req); +out_free_ahash: crypto_free_ahash(tfm); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -666,14 +676,17 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, u32 ret; struct scatterlist sg[1]; SKCIPHER_REQUEST_ON_STACK(req, cipher); - u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2]; + u8 *data; struct page **save_pages; u32 len = buf->len - offset; - if (len > ARRAY_SIZE(data)) { + if (len > GSS_KRB5_MAX_BLOCKSIZE * 2) { WARN_ON(0); return -ENOMEM; } + data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS); + if (!data) + return -ENOMEM; /* * For encryption, we want to read from the cleartext @@ -708,6 +721,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, ret = write_bytes_to_xdr_buf(buf, offset, data, len); out: + kfree(data); return ret; } @@ -949,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, } desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate shash descriptor for '%s'\n", __func__, kctx->gk5e->cksum_name); @@ -1016,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, } desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate shash descriptor for '%s'\n", __func__, kctx->gk5e->cksum_name); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 60595835317a..7bb2514aadd9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) goto out_err_free_hmac; - desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate hash descriptor for '%s'\n", __func__, ctx->gk5e->cksum_name); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index eeeba5adee6d..25d9a9cf7b66 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, kgid = make_kgid(&init_user_ns, tmp); if (!gid_valid(kgid)) goto out_free_groups; - GROUP_AT(creds->cr_group_info, i) = kgid; + creds->cr_group_info->gid[i] = kgid; } return 0; @@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, if (!oa->data) return -ENOMEM; - creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); + creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { kfree(oa->data); return -ENOMEM; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d8582028b346..153082598522 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, id); if (!gid_valid(kgid)) goto out; - GROUP_AT(rsci.cred.cr_group_info, i) = kgid; + rsci.cred.cr_group_info->gid[i] = kgid; } /* mech name */ @@ -718,30 +718,37 @@ gss_write_null_verf(struct svc_rqst *rqstp) static int gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { - __be32 xdr_seq; + __be32 *xdr_seq; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj mic; __be32 *p; struct kvec iov; + int err = -1; svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); - xdr_seq = htonl(seq); + xdr_seq = kmalloc(4, GFP_KERNEL); + if (!xdr_seq) + return -1; + *xdr_seq = htonl(seq); - iov.iov_base = &xdr_seq; - iov.iov_len = sizeof(xdr_seq); + iov.iov_base = xdr_seq; + iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_data); p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); if (maj_stat != GSS_S_COMPLETE) - return -1; + goto out; *p++ = htonl(mic.len); memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); p += XDR_QUADLEN(mic.len); if (!xdr_ressize_check(rqstp, p)) - return -1; - return 0; + goto out; + err = 0; +out: + kfree(xdr_seq); + return err; } struct gss_domain { @@ -1482,7 +1489,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = get_seconds(); + rsci->h.expiry_time = seconds_since_boot(); set_bit(CACHE_NEGATIVE, &rsci->h.flags); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; @@ -1541,7 +1548,7 @@ complete: ret = SVC_COMPLETE; goto out; drop: - ret = SVC_DROP; + ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index a99278c984e8..306fc0f54596 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth) rpcauth_clear_credcache(auth->au_credcache); } +static int +unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kgid(&init_user_ns, acred->gid) | + ((u64)from_kuid(&init_user_ns, acred->uid) << + (sizeof(gid_t) * 8)), hashbits); +} + /* * Lookup AUTH_UNIX creds for current process */ @@ -79,7 +87,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + cred->uc_gids[i] = acred->group_info->gid[i]; if (i < NFS_NGROUPS) cred->uc_gids[i] = INVALID_GID; @@ -127,7 +135,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (groups > NFS_NGROUPS) groups = NFS_NGROUPS; for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) + if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; @@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = { .au_name = "UNIX", .create = unx_create, .destroy = unx_destroy, + .hash_cred = unx_hash_cred, .lookup_cred = unx_lookup_cred, .crcreate = unx_create_cred, }; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 229956bf8457..ac701c28f44f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) page = alloc_page(gfp_flags); if (page == NULL) return -ENOMEM; - buf->head[0].iov_base = page_address(page); - buf->head[0].iov_len = PAGE_SIZE; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = PAGE_SIZE; + xdr_buf_init(buf, page_address(page), PAGE_SIZE); return 0; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4d8e11f94a35..f39e3e11f9aa 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -21,7 +21,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/string_helpers.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/poll.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_unlock(&cache_list_lock); /* start the cleaning process */ - schedule_delayed_work(&cache_cleaner, 0); + queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0); } EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail); @@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work) delay = 0; if (delay) - schedule_delayed_work(&cache_cleaner, delay); + queue_delayed_work(system_power_efficient_wq, + &cache_cleaner, delay); } @@ -1357,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), - atomic_read(&cp->ref.refcount), cp->flags); + kref_read(&cp->ref), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 66f23b376fa0..1dc9f3bac099 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { struct dentry *dentry; - int err = 0; switch (event) { case RPC_PIPEFS_MOUNT: @@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event); return -ENOTSUPP; } - return err; + return 0; } static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, @@ -337,6 +336,11 @@ out: static DEFINE_IDA(rpc_clids); +void rpc_cleanup_clids(void) +{ + ida_destroy(&rpc_clids); +} + static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; @@ -988,7 +992,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { if (clnt != NULL) { - rpc_task_release_client(task); if (task->tk_xprt == NULL) task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); task->tk_client = clnt; @@ -1693,6 +1696,7 @@ call_allocate(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + int status; dprint_status(task); @@ -1718,11 +1722,14 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; req->rq_rcvsize <<= 2; - req->rq_buffer = xprt->ops->buf_alloc(task, - req->rq_callsize + req->rq_rcvsize); - if (req->rq_buffer != NULL) - return; + status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); + if (status == 0) + return; + if (status != -ENOMEM) { + rpc_exit(task, status); + return; + } dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); @@ -1748,18 +1755,6 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; } -static inline void -rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) -{ - buf->head[0].iov_base = start; - buf->head[0].iov_len = len; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->flags = 0; - buf->len = 0; - buf->buflen = len; -} - /* * 3. Encode arguments of an RPC call */ @@ -1772,12 +1767,12 @@ rpc_xdr_encode(struct rpc_task *task) dprint_status(task); - rpc_xdr_buf_init(&req->rq_snd_buf, - req->rq_buffer, - req->rq_callsize); - rpc_xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, - req->rq_rcvsize); + xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + xdr_buf_init(&req->rq_rcv_buf, + req->rq_rbuffer, + req->rq_rcvsize); p = rpc_encode_header(task); if (p == NULL) { @@ -1936,6 +1931,8 @@ call_connect_status(struct rpc_task *task) case -EADDRINUSE: case -ENOBUFS: case -EPIPE: + xprt_conditional_disconnect(task->tk_rqstp->rq_xprt, + task->tk_rqstp->rq_connect_cookie); if (RPC_IS_SOFTCONN(task)) break; /* retry with existing socket, after a delay */ @@ -2616,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); /** + * rpc_clnt_setup_test_and_add_xprt() + * + * This is an rpc_clnt_add_xprt setup() function which returns 1 so: + * 1) caller of the test function must dereference the rpc_xprt_switch + * and the rpc_xprt. + * 2) test function must call rpc_xprt_switch_add_xprt, usually in + * the rpc_call_done routine. + * + * Upon success (return of 1), the test function adds the new + * transport to the rpc_clnt xprt switch + * + * @clnt: struct rpc_clnt to get the new transport + * @xps: the rpc_xprt_switch to hold the new transport + * @xprt: the rpc_xprt to test + * @data: a struct rpc_add_xprt_test pointer that holds the test function + * and test function call data + */ +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_cred *cred; + struct rpc_task *task; + struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; + int status = -EADDRINUSE; + + xprt = xprt_get(xprt); + xprt_switch_get(xps); + + if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) + goto out_err; + + /* Test the connection */ + cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_call_null_helper(clnt, xprt, cred, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + NULL, NULL); + put_rpccred(cred); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_err; + } + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + goto out_err; + + /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ + xtest->add_xprt_test(clnt, xprt, xtest->data); + + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ + return 1; +out_err: + xprt_put(xprt); + xprt_switch_put(xps); + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", + status, xprt->address_strings[RPC_DISPLAY_ADDR]); + return status; +} +EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); + +/** * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt * @xprtargs: pointer to struct xprt_create @@ -2697,6 +2758,37 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) +{ + rcu_read_lock(); + xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); + +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + rcu_read_lock(); + rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); + +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap) +{ + struct rpc_xprt_switch *xps; + bool ret; + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + ret = rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index df5826876535..394ce523174c 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -34,7 +34,7 @@ struct sunrpc_net { struct proc_dir_entry *use_gssp_proc; }; -extern int sunrpc_net_id; +extern unsigned int sunrpc_net_id; int ip_map_cache_create(struct net *); void ip_map_cache_destroy(struct net *); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 84f98cbe31c3..61a504fb1ae2 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -477,7 +477,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9ae588511aaf..5db68b371db2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work) } /** - * rpc_malloc - allocate an RPC buffer - * @task: RPC task that will use this buffer - * @size: requested byte size + * rpc_malloc - allocate RPC buffer resources + * @task: RPC task + * + * A single memory region is allocated, which is split between the + * RPC call and RPC reply that this task is being used for. When + * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL and suppressing warning if the request cannot be serviced - * immediately. - * The caller can arrange to sleep in a way that is safe for rpciod. + * returning -ENOMEM and suppressing warning if the request cannot + * be serviced immediately. The caller can arrange to sleep in a + * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, @@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work) * In order to avoid memory starvation triggering more writebacks of * NFS requests, we avoid using GFP_KERNEL. */ -void *rpc_malloc(struct rpc_task *task, size_t size) +int rpc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = GFP_NOIO | __GFP_NOWARN; @@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size) buf = kmalloc(size, gfp); if (!buf) - return NULL; + return -ENOMEM; buf->len = size; dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); - return &buf->data; + rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; + return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); /** - * rpc_free - free buffer allocated via rpc_malloc - * @buffer: buffer to free + * rpc_free - free RPC buffer resources allocated via rpc_malloc + * @task: RPC task * */ -void rpc_free(void *buffer) +void rpc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 2ecb994314c1..caeb01ad2b5a 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; - op_metrics->om_ntrans += req->rq_ntrans; + /* kernel API: om_ops must never become larger than om_ntrans */ + op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; - delta = ktime_sub(req->rq_xtime, task->tk_start); - op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); - + if (ktime_to_ns(req->rq_xtime)) { + delta = ktime_sub(req->rq_xtime, task->tk_start); + op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); + } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); delta = ktime_sub(now, task->tk_start); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ee5d3d253102..c73de181467a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,7 +24,7 @@ #include "netns.h" -int sunrpc_net_id; +unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) @@ -119,6 +119,7 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_cleanup_clids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c5b0cb4f4056..75f290bddca1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net) } EXPORT_SYMBOL_GPL(svc_bind); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static void +__svc_init_bc(struct svc_serv *serv) +{ + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); +} +#else +static void +__svc_init_bc(struct svc_serv *serv) +{ +} +#endif + /* * Create an RPC service */ @@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); + __svc_init_bc(serv); + serv->sv_nrpools = npools; serv->sv_pools = kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), @@ -1138,8 +1155,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: - if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) - svc_close_xprt(rqstp->rq_xprt); + goto close; case SVC_DROP: goto dropit; case SVC_COMPLETE: @@ -1229,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) sendit: if (svc_authorise(rqstp)) - goto dropit; + goto close; return 1; /* Caller can now send it */ dropit: @@ -1237,11 +1253,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) dprintk("svc: svc_process dropit\n"); return 0; + close: + if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + svc_close_xprt(rqstp->rq_xprt); + dprintk("svc: svc_process close\n"); + return 0; + err_short_len: svc_printk(rqstp, "short len %Zd, dropping request\n", argv->iov_len); - - goto dropit; /* drop request */ + goto close; err_bad_rpc: serv->sv_stats->rpcbadfmt++; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c3f652395a80..7bfe1fb42add 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) svc_xprt_get(xprt); dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, atomic_read(&xprt->xpt_ref.refcount)); + xprt, kref_read(&xprt->xpt_ref)); } spin_unlock_bh(&pool->sp_lock); out: @@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); + if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags)) + xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ goto out; @@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); @@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure) * through, close it. */ if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) continue; - if (atomic_read(&xprt->xpt_ref.refcount) > 1 || + if (kref_read(&xprt->xpt_ref) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; list_del_init(le); @@ -1002,14 +1004,8 @@ static void svc_age_temp_xprts(unsigned long closure) void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) { struct svc_xprt *xprt; - struct svc_sock *svsk; - struct socket *sock; struct list_head *le, *next; LIST_HEAD(to_be_closed); - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; spin_lock_bh(&serv->sv_lock); list_for_each_safe(le, next, &serv->sv_tempsocks) { @@ -1026,12 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) le = to_be_closed.next; list_del_init(le); xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); - svc_close_xprt(xprt); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_KILL_TEMP, &xprt->xpt_flags); + dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n", + xprt); + svc_xprt_enqueue(xprt); } } EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 69841db1f533..bb8db3cb8032 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -124,16 +124,20 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister); #define DN_HASHMAX (1<<DN_HASHBITS) static struct hlist_head auth_domain_table[DN_HASHMAX]; -static spinlock_t auth_domain_lock = - __SPIN_LOCK_UNLOCKED(auth_domain_lock); +static DEFINE_SPINLOCK(auth_domain_lock); + +static void auth_domain_release(struct kref *kref) +{ + struct auth_domain *dom = container_of(kref, struct auth_domain, ref); + + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + spin_unlock(&auth_domain_lock); +} void auth_domain_put(struct auth_domain *dom) { - if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { - hlist_del(&dom->hash); - dom->flavour->domain_release(dom); - spin_unlock(&auth_domain_lock); - } + kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock); } EXPORT_SYMBOL_GPL(auth_domain_put); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index dfacdc95b3f5..64af4f034de6 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, gid); if (!gid_valid(kgid)) goto out; - GROUP_AT(ug.gi, i) = kgid; + ug.gi->gid[i] = kgid; } ugp = unix_gid_lookup(cd, uid); @@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m, seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); for (i = 0; i < glen; i++) - seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); + seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i])); seq_printf(m, "\n"); return 0; } @@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_CLOSE; for (i = 0; i < slen; i++) { kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); - GROUP_AT(cred->cr_group_info, i) = kgid; + cred->cr_group_info->gid[i] = kgid; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 57625f64efd5..de066acdb34e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -39,9 +39,10 @@ #include <net/checksum.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/udp.h> #include <net/tcp.h> #include <net/tcp_states.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ioctls.h> #include <trace/events/skb.h> @@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp) } } +static void svc_release_udp_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_xprt_ctxt; + + if (skb) { + rqstp->rq_xprt_ctxt = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + consume_skb(skb); + } +} + union svc_pktinfo_u { struct in_pktinfo pkti; struct in6_pktinfo pkti6; @@ -438,6 +451,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } +static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) +{ + struct svc_sock *svsk; + struct socket *sock; + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -534,7 +562,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); if (err >= 0) - skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err); + skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err); if (skb == NULL) { if (err != -EAGAIN) { @@ -546,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) } len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; - if (skb->tstamp.tv64 == 0) { + if (skb->tstamp == 0) { skb->tstamp = ktime_get_real(); /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ @@ -575,7 +603,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) goto out_free; } local_bh_enable(); - skb_free_datagram_locked(svsk->sk_sk, skb); + consume_skb(skb); } else { /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data; @@ -602,8 +630,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) return len; out_free: - trace_kfree_skb(skb, svc_udp_recvfrom); - skb_free_datagram_locked(svsk->sk_sk, skb); + kfree_skb(skb); return 0; } @@ -648,6 +675,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) return NULL; } +static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + static struct svc_xprt *svc_udp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -660,13 +691,14 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_release_rqst = svc_release_skb, + .xpo_release_rqst = svc_release_udp_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt, }; static struct svc_xprt_class svc_udp_class = { @@ -1242,6 +1274,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index c88d9bc06f5c..8c3936403fea 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -14,7 +14,7 @@ #include <linux/sysctl.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/stats.h> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c4f3cc0c0775..7f1071e103ca 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase -= xdr->buf->page_base; if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) @@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr) xdr_set_next_page(xdr); else if (xdr->iov == xdr->buf->head) { if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); } return xdr->p != xdr->end; } @@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; - void *cpdest = xdr->scratch.iov_base; + char *cpdest = xdr->scratch.iov_base; size_t cplen = (char *)xdr->end - (char *)xdr->p; if (nbytes > xdr->scratch.iov_len) return NULL; - memcpy(cpdest, xdr->p, cplen); + p = __xdr_inline_decode(xdr, cplen); + if (p == NULL) + return NULL; + memcpy(cpdest, p, cplen); cpdest += cplen; nbytes -= cplen; if (!xdr_set_next_buffer(xdr)) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ea244b29138b..9a6be030ca7d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -669,7 +669,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) spin_lock_bh(&xprt->transport_lock); if (cookie != xprt->connect_cookie) goto out; - if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt)) + if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ @@ -772,6 +772,7 @@ void xprt_connect(struct rpc_task *task) if (!xprt_connected(xprt)) { task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; + task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); if (test_bit(XPRT_CLOSING, &xprt->state)) @@ -1295,7 +1296,7 @@ void xprt_release(struct rpc_task *task) xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) - xprt->ops->buf_free(req->rq_buffer); + xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 66c9d63f4797..ae92a9e9ba52 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -15,6 +15,7 @@ #include <asm/cmpxchg.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, @@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && + !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } @@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + struct list_head *head; + struct rpc_xprt *pos; + + if (xps == NULL || sap == NULL) + return false; + + head = &xps->xps_xprt_list; + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { + pr_info("RPC: addr %s already in xprt switch\n", + pos->address_strings[RPC_DISPLAY_ADDR]); + return true; + } + } + return false; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 87762d976b63..24fedd4b117e 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, list_del(&req->rl_all); spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(&r_xprt->rx_ia, req); + rpcrdma_destroy_req(req); kfree(rqst); } @@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - struct xdr_buf *buf; size_t size; req = rpcrdma_create_req(r_xprt); @@ -46,30 +44,20 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, return PTR_ERR(req); req->rl_backchannel = true; - size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); - rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; - size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); - rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + size = r_xprt->rx_data.inline_rsize; + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; req->rl_sendbuf = rb; - /* so that rpcr_to_rdmar works when receiving a request */ - rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - - buf = &rqst->rq_snd_buf; - buf->head[0].iov_base = rqst->rq_buffer; - buf->head[0].iov_len = 0; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = size; - + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, + min_t(size_t, size, PAGE_SIZE)); + rpcrdma_set_xprtdata(rqst, req); return 0; out_fail: @@ -204,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) size_t maxmsg; maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -219,7 +208,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_msg *headerp; - size_t rpclen; headerp = rdmab_to_msg(req->rl_rdmabuf); headerp->rm_xid = rqst->rq_xid; @@ -231,26 +219,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) headerp->rm_body.rm_chunks[1] = xdr_zero; headerp->rm_body.rm_chunks[2] = xdr_zero; - rpclen = rqst->rq_svec[0].iov_len; - -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", - __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); - pr_info("RPC: %s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); - pr_info("RPC: %s: RPC: %*ph\n", - __func__, (int)rpclen, rqst->rq_svec[0].iov_base); -#endif - - req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); - req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; - req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); - - req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); - req->rl_send_iov[1].length = rpclen; - req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 2; + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) + return -EIO; return 0; } @@ -402,7 +373,7 @@ out_overflow: out_short: pr_warn("RPC/RDMA short backward direction call\n"); - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) xprt_disconnect_done(xprt); else pr_warn("RPC: %s: reposting rep %p\n", diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 21cb3b150b37..1ebb09e1ac4f 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -160,9 +160,8 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); return 0; } @@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ list_for_each_entry(mw, &req->rl_registered, mw_list) list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) goto out_reset; @@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_init_mr = fmr_op_init_mr, .ro_release_mr = fmr_op_release_mr, .ro_displayname = "fmr", + .ro_send_w_inv_ok = 0, }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 892b5e1d9b09..47bed5333c7f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -44,18 +44,20 @@ * being done. * * When the underlying transport disconnects, MRs are left in one of - * three states: + * four states: * * INVALID: The MR was not in use before the QP entered ERROR state. - * (Or, the LOCAL_INV WR has not completed or flushed yet). - * - * STALE: The MR was being registered or unregistered when the QP - * entered ERROR state, and the pending WR was flushed. * * VALID: The MR was registered before the QP entered ERROR state. * - * When frwr_op_map encounters STALE and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Beause MR recovery + * FLUSHED_FR: The MR was being registered when the QP entered ERROR + * state, and the pending WR was flushed. + * + * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR + * state, and the pending WR was flushed. + * + * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the * recovered MRs are placed back on the rb_mws list when recovery is * complete. frwr_op_map allocates another MR for the current RPC while @@ -67,6 +69,8 @@ * pending send queue WRs before the transport is reconnected. */ +#include <linux/sunrpc/rpc_rdma.h> + #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -97,7 +101,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) struct rpcrdma_frmr *f = &r->frmr; int rc; - f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); + f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(f->fr_mr)) goto out_mr_err; @@ -153,7 +157,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return rc; } - f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, + f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, ia->ri_max_frmr_depth); if (IS_ERR(f->fr_mr)) { pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", @@ -161,26 +165,25 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return PTR_ERR(f->fr_mr); } - dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); f->fr_state = FRMR_IS_INVALID; return 0; } /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. - * - * There's no recovery if this fails. The FRMR is abandoned, but - * remains in rb_all. It will be cleaned up when the transport is - * destroyed. */ static void frwr_op_recover_mr(struct rpcrdma_mw *mw) { + enum rpcrdma_frmr_state state = mw->frmr.fr_state; struct rpcrdma_xprt *r_xprt = mw->mw_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; rc = __frwr_reset_mr(ia, mw); - ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (state != FRMR_FLUSHED_LI) + ib_dma_unmap_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); if (rc) goto out_release; @@ -203,11 +206,16 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + struct ib_device_attr *attrs = &ia->ri_device->attrs; int depth, delta; + ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - ia->ri_device->attrs.max_fast_reg_page_list_len); + attrs->max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -234,17 +242,16 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { - cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) { + cdata->max_requests = attrs->max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; } - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth); return 0; } @@ -261,10 +268,8 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) } static void -__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr, - const char *wr) +__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) { - frmr->fr_state = FRMR_IS_STALE; if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: %s: %s (%u/0x%x)\n", wr, ib_wc_status_msg(wc->status), @@ -287,7 +292,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - __frwr_sendcompletion_flush(wc, frmr, "fastreg"); + frmr->fr_state = FRMR_FLUSHED_FR; + __frwr_sendcompletion_flush(wc, "fastreg"); } } @@ -307,7 +313,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - __frwr_sendcompletion_flush(wc, frmr, "localinv"); + frmr->fr_state = FRMR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); } } @@ -327,9 +334,11 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - if (wc->status != IB_WC_SUCCESS) - __frwr_sendcompletion_flush(wc, frmr, "localinv"); - complete_all(&frmr->fr_linv_done); + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } + complete(&frmr->fr_linv_done); } /* Post a REG_MR Work Request to register a memory region @@ -340,6 +349,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; + bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; @@ -375,8 +385,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, ++seg; ++i; - - /* Check for holes */ + if (holes_ok) + continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; @@ -396,7 +406,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, goto out_mapmr_err; dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, mw->mw_nents, mr->length); + __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -413,7 +423,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - DECR_CQCOUNT(&r_xprt->rx_ep); + rpcrdma_set_signaled(&r_xprt->rx_ep, ®_wr->wr); rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -443,24 +453,6 @@ out_senderr: return -ENOTCONN; } -static struct ib_send_wr * -__frwr_prepare_linv_wr(struct rpcrdma_mw *mw) -{ - struct rpcrdma_frmr *f = &mw->frmr; - struct ib_send_wr *invalidate_wr; - - f->fr_state = FRMR_IS_INVALID; - invalidate_wr = &f->fr_invwr; - - memset(invalidate_wr, 0, sizeof(*invalidate_wr)); - f->fr_cqe.done = frwr_wc_localinv; - invalidate_wr->wr_cqe = &f->fr_cqe; - invalidate_wr->opcode = IB_WR_LOCAL_INV; - invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; - - return invalidate_wr; -} - /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -471,11 +463,12 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw) static void frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct ib_send_wr *first, **prev, *last, *bad_wr; + struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; - int rc; + int count, rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -485,32 +478,53 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * a single ib_post_send() call. */ f = NULL; - invalidate_wrs = pos = prev = NULL; + count = 0; + prev = &first; list_for_each_entry(mw, &req->rl_registered, mw_list) { - pos = __frwr_prepare_linv_wr(mw); + mw->frmr.fr_state = FRMR_IS_INVALID; + + if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) && + (mw->mw_handle == rep->rr_inv_rkey)) + continue; - if (!invalidate_wrs) - invalidate_wrs = pos; - else - prev->next = pos; - prev = pos; f = &mw->frmr; + dprintk("RPC: %s: invalidating frmr %p\n", + __func__, f); + + f->fr_cqe.done = frwr_wc_localinv; + last = &f->fr_invwr; + memset(last, 0, sizeof(*last)); + last->wr_cqe = &f->fr_cqe; + last->opcode = IB_WR_LOCAL_INV; + last->ex.invalidate_rkey = mw->mw_handle; + count++; + + *prev = last; + prev = &last->next; } + if (!f) + goto unmap; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. */ - f->fr_invwr.send_flags = IB_SEND_SIGNALED; + last->send_flags = IB_SEND_SIGNALED; f->fr_cqe.done = frwr_wc_localinv_wake; reinit_completion(&f->fr_linv_done); - INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Initialize CQ count, since there is always a signaled + * WR being posted here. The new cqcount depends on how + * many SQEs are about to be consumed. + */ + rpcrdma_init_cqcount(&r_xprt->rx_ep, count); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us * unless ri_id->qp is a valid pointer. */ - rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + r_xprt->rx_stats.local_inv_needed++; + rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); if (rc) goto reset_mrs; @@ -521,6 +535,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ unmap: list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + dprintk("RPC: %s: DMA unmapping frmr %p\n", + __func__, &mw->frmr); list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); @@ -537,7 +553,7 @@ reset_mrs: */ list_for_each_entry(mw, &req->rl_registered, mw_list) { f = &mw->frmr; - if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { + if (mw->mw_handle == bad_wr->ex.invalidate_rkey) { __frwr_reset_mr(ia, mw); bad_wr = bad_wr->next; } @@ -576,4 +592,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_init_mr = frwr_op_init_mr, .ro_release_mr = frwr_op_release_mr, .ro_displayname = "frwr", + .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, }; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a47f170b20ef..c52e0f2ffe52 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - static const char transfertypes[][12] = { "inline", /* no chunks */ "read list", /* some argument via rdma read */ @@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata, - unsigned int maxsegs) +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + ia->ri_max_inline_write = cdata->inline_wsize - rpcrdma_max_call_header_size(maxsegs); ia->ri_max_inline_read = cdata->inline_rsize - @@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } -static int -rpcrdma_tail_pullup(struct xdr_buf *buf) -{ - size_t tlen = buf->tail[0].iov_len; - size_t skip = tlen & 3; - - /* Do not include the tail if it is only an XDR pad */ - if (tlen < 4) - return 0; - - /* xdr_write_pages() adds a pad at the beginning of the tail - * if the content in "buf->pages" is unaligned. Force the - * tail's actual content to land at the next XDR position - * after the head instead. - */ - if (skip) { - unsigned char *src, *dst; - unsigned int count; - - src = buf->tail[0].iov_base; - dst = buf->head[0].iov_base; - dst += buf->head[0].iov_len; - - src += skip; - tlen -= skip; - - dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", - __func__, skip, dst, src, tlen); - - for (count = tlen; count; count--) - *dst++ = *src++; - } - - return tlen; -} - /* Split "vec" on page boundaries into segments. FMR registers pages, * not a byte range. Other modes coalesce these segments into a single * MR when they can. @@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) static int rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, + bool reminv_expected) { int len, n, p, page_base; struct page **ppages; @@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (type == rpcrdma_readch) return n; + /* When encoding the Write list, some servers need to see an extra + * segment for odd-length Write chunks. The upper layer provides + * space in the tail iovec for this purpose. + */ + if (type == rpcrdma_writech && reminv_expected) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, if (rtype == rpcrdma_areadch) pos = 0; seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); if (nsegs < 0) return ERR_PTR(nsegs); @@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg); + wtype, seg, + r_xprt->rx_ia.ri_reminv_expected); if (nsegs < 0) return ERR_PTR(nsegs); @@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, + r_xprt->rx_ia.ri_reminv_expected); if (nsegs < 0) return ERR_PTR(nsegs); @@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return iptr; } -/* - * Copy write data inline. - * This function is used for "small" requests. Data which is passed - * to RPC via iovecs (or page list) is copied directly into the - * pre-registered memory buffer for this request. For small amounts - * of data, this is efficient. The cutoff value is tunable. +/* Prepare the RPC-over-RDMA header SGE. + */ +static bool +rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 len) +{ + struct rpcrdma_regbuf *rb = req->rl_rdmabuf; + struct ib_sge *sge = &req->rl_send_sge[0]; + + if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { + if (!__rpcrdma_dma_map_regbuf(ia, rb)) + return false; + sge->addr = rdmab_addr(rb); + sge->lkey = rdmab_lkey(rb); + } + sge->length = len; + + ib_dma_sync_single_for_device(ia->ri_device, sge->addr, + sge->length, DMA_TO_DEVICE); + req->rl_send_wr.num_sge++; + return true; +} + +/* Prepare the Send SGEs. The head and tail iovec, and each entry + * in the page list, gets its own SGE. */ -static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) +static bool +rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - int i, npages, curlen; - int copy_len; - unsigned char *srcp, *destp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int page_base; - struct page **ppages; + unsigned int sge_no, page_base, len, remaining; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct ib_device *device = ia->ri_device; + struct ib_sge *sge = req->rl_send_sge; + u32 lkey = ia->ri_pd->local_dma_lkey; + struct page *page, **ppages; + + /* The head iovec is straightforward, as it is already + * DMA-mapped. Sync the content that has changed. + */ + if (!rpcrdma_dma_map_regbuf(ia, rb)) + return false; + sge_no = 1; + sge[sge_no].addr = rdmab_addr(rb); + sge[sge_no].length = xdr->head[0].iov_len; + sge[sge_no].lkey = rdmab_lkey(rb); + ib_dma_sync_single_for_device(device, sge[sge_no].addr, + sge[sge_no].length, DMA_TO_DEVICE); + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. However, the + * tail iovec may include an XDR pad for the page list, as + * well as additional content, and may not reside in the + * same page as the head iovec. + */ + if (rtype == rpcrdma_readch) { + len = xdr->tail[0].iov_len; - destp = rqst->rq_svec[0].iov_base; - curlen = rqst->rq_svec[0].iov_len; - destp += curlen; + /* Do not include the tail if it is only an XDR pad */ + if (len < 4) + goto out; - dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", - __func__, destp, rqst->rq_slen, curlen); + page = virt_to_page(xdr->tail[0].iov_base); + page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - copy_len = rqst->rq_snd_buf.page_len; + /* If the content in the page list is an odd length, + * xdr_write_pages() has added a pad at the beginning + * of the tail iovec. Force the tail's non-pad content + * to land at the next XDR position in the Send message. + */ + page_base += len & 3; + len -= len & 3; + goto map_tail; + } - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { - memmove(destp + copy_len, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; + /* If there is a page list present, temporarily DMA map + * and prepare an SGE for each page to be sent. + */ + if (xdr->page_len) { + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + sge_no++; + if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) + goto out_mapping_overflow; + + len = min_t(u32, PAGE_SIZE - page_base, remaining); + sge[sge_no].addr = ib_dma_map_page(device, *ppages, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + + req->rl_mapped_sges++; + ppages++; + remaining -= len; + page_base = 0; } - dprintk("RPC: %s: tail destp 0x%p len %d\n", - __func__, destp + copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; } - r_xprt->rx_stats.pullup_copy_count += copy_len; - page_base = rqst->rq_snd_buf.page_base; - ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); - page_base &= ~PAGE_MASK; - npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; - for (i = 0; copy_len && i < npages; i++) { - curlen = PAGE_SIZE - page_base; - if (curlen > copy_len) - curlen = copy_len; - dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", - __func__, i, destp, copy_len, curlen); - srcp = kmap_atomic(ppages[i]); - memcpy(destp, srcp+page_base, curlen); - kunmap_atomic(srcp); - rqst->rq_svec[0].iov_len += curlen; - destp += curlen; - copy_len -= curlen; - page_base = 0; + /* The tail iovec is not always constructed in the same + * page where the head iovec resides (see, for example, + * gss_wrap_req_priv). To neatly accommodate that case, + * DMA map it separately. + */ + if (xdr->tail[0].iov_len) { + page = virt_to_page(xdr->tail[0].iov_base); + page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + len = xdr->tail[0].iov_len; + +map_tail: + sge_no++; + sge[sge_no].addr = ib_dma_map_page(device, page, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + req->rl_mapped_sges++; } - /* header now contains entire send message */ + +out: + req->rl_send_wr.num_sge = sge_no + 1; + return true; + +out_mapping_overflow: + pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); + return false; + +out_mapping_err: + pr_err("rpcrdma: Send mapping error\n"); + return false; +} + +bool +rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 hdrlen, struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) +{ + req->rl_send_wr.num_sge = 0; + req->rl_mapped_sges = 0; + + if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen)) + goto out_map; + + if (rtype != rpcrdma_areadch) + if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype)) + goto out_map; + + return true; + +out_map: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; +} + +void +rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_device *device = ia->ri_device; + struct ib_sge *sge; + int count; + + sge = &req->rl_send_sge[2]; + for (count = req->rl_mapped_sges; count--; sge++) + ib_dma_unmap_page(device, sge->addr, sge->length, + DMA_TO_DEVICE); + req->rl_mapped_sges = 0; } /* * Marshal a request: the primary job of this routine is to choose * the transfer modes. See comments below. * - * Prepares up to two IOVs per Call message: - * - * [0] -- RPC RDMA header - * [1] -- the RPC header/data - * * Returns zero on success, otherwise a negative errno. */ @@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { rtype = rpcrdma_noch; - rpcrdma_inline_pullup(rqst); - rpclen = rqst->rq_svec[0].iov_len; + rpclen = rqst->rq_snd_buf.len; } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; - rpclen = rqst->rq_svec[0].iov_len; - rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); + rpclen = rqst->rq_snd_buf.head[0].iov_len + + rqst->rq_snd_buf.tail[0].iov_len; } else { r_xprt->rx_stats.nomsg_call_count++; headerp->rm_type = htonl(RDMA_NOMSG); @@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) goto out_unmap; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; - if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - goto out_overflow; - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", rqst->rq_task->tk_pid, __func__, transfertypes[rtype], transfertypes[wtype], hdrlen, rpclen); - req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); - req->rl_send_iov[0].length = hdrlen; - req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); - - req->rl_niovs = 1; - if (rtype == rpcrdma_areadch) - return 0; - - req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); - req->rl_send_iov[1].length = rpclen; - req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 2; + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, + &rqst->rq_snd_buf, rtype)) { + iptr = ERR_PTR(-EIO); + goto out_unmap; + } return 0; -out_overflow: - pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", - hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); - iptr = ERR_PTR(-EIO); - out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); return PTR_ERR(iptr); @@ -725,7 +786,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) ifdebug(FACILITY) { u64 off; xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); - dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", + dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", __func__, be32_to_cpu(seg->rs_length), (unsigned long long)off, @@ -845,28 +906,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } -void -rpcrdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - spin_lock_bh(&xprt->transport_lock); - if (++xprt->connect_cookie == 0) /* maintain a reserved value */ - ++xprt->connect_cookie; - if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) - xprt_wake_pending_tasks(xprt, 0); - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock_bh(&xprt->transport_lock); -} - #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes @@ -898,26 +937,16 @@ rpcrdma_is_bcall(struct rpcrdma_msg *headerp) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * This function is called when an async event is posted to - * the connection which changes the connection state. All it - * does at this point is mark the connection up/down, the rpc - * timers do the rest. - */ -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. */ void -rpcrdma_reply_handler(struct rpcrdma_rep *rep) +rpcrdma_reply_handler(struct work_struct *work) { + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -1132,6 +1161,6 @@ out_duplicate: repost: r_xprt->rx_stats.bad_reply_count++; - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..288e35c2d8f4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, ret = -EIO; goto out_unmap; } - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); memset(&send_wr, 0, sizeof(send_wr)); ctxt->cqe.done = svc_rdma_wc_send; @@ -159,35 +159,40 @@ out_unmap: /* Server-side transport endpoint wants a whole page for its send * buffer. The client RPC code constructs the RPC header in this * buffer before it invokes ->send_request. - * - * Returns NULL if there was a temporary allocation failure. */ -static void * -xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; - struct svcxprt_rdma *rdma; + size_t size = rqst->rq_callsize; struct page *page; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - - /* Prevent an infinite loop: try to make this case work */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); + return -EINVAL; + } + /* svc_rdma_sendto releases this page */ page = alloc_page(RPCRDMA_DEF_GFP); if (!page) - return NULL; + return -ENOMEM; + rqst->rq_buffer = page_address(page); - return page_address(page); + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + if (!rqst->rq_rbuffer) { + put_page(page); + return -ENOMEM; + } + return 0; } static void -xprt_rdma_bc_free(void *buffer) +xprt_rdma_bc_free(struct rpc_task *task) { - /* No-op: ctxt and page have already been freed. */ + struct rpc_rqst *rqst = task->tk_rqstp; + + kfree(rqst->rq_rbuffer); } static int @@ -350,6 +355,7 @@ xprt_setup_rdma_bc(struct xprt_create *args) out_fail: xprt_rdma_free_addresses(xprt); args->bc_xprt->xpt_bc_xprt = NULL; + args->bc_xprt->xpt_bc_xps = NULL; xprt_put(xprt); xprt_free(xprt); return ERR_PTR(-EINVAL); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c25606f2561..172b537f8cfc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->sge[pno].addr); if (ret) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; @@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, frmr->sg); return -ENOMEM; } - atomic_inc(&xprt->sc_dma_used); n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { @@ -348,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, atomic_inc(&rdma_stat_read); return ret; err: - ib_dma_unmap_sg(xprt->sc_cm_id->device, - frmr->sg, frmr->sg_nents, frmr->direction); svc_rdma_put_context(ctxt, 0); svc_rdma_put_frmr(xprt, frmr); return ret; @@ -374,9 +371,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, u32 position, u32 byte_count, u32 page_offset, int page_no) { char *srcp, *destp; - int ret; - ret = 0; srcp = head->arg.head[0].iov_base + position; byte_count = head->arg.head[0].iov_len - position; if (byte_count > PAGE_SIZE) { @@ -415,6 +410,20 @@ done: return 1; } +/* Returns the address of the first read chunk or <nul> if no read chunk + * is present + */ +static struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == xdr_zero) + return NULL; + return ch; +} + static int rdma_read_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp, @@ -627,8 +636,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto defer; goto out; } - dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", - ctxt, rdma_xprt, rqstp, ctxt->wc_status); + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n", + ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); /* Build up the XDR from the receive buffers. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 54d533300620..ad4d286a83c5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -153,76 +153,68 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, return dma_addr; } -/* Returns the address of the first read chunk or <nul> if no read chunk - * is present +/* Parse the RPC Call's transport header. */ -struct rpcrdma_read_chunk * -svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, + struct rpcrdma_write_array **write, + struct rpcrdma_write_array **reply) { - struct rpcrdma_read_chunk *ch = - (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + __be32 *p; - if (ch->rc_discrim == xdr_zero) - return NULL; - return ch; -} + p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; -/* Returns the address of the first read write array element or <nul> - * if no write array list is present - */ -static struct rpcrdma_write_array * -svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) -{ - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || - rmsgp->rm_body.rm_chunks[1] == xdr_zero) - return NULL; - return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; + /* Read list */ + while (*p++ != xdr_zero) + p += 5; + + /* Write list */ + if (*p != xdr_zero) { + *write = (struct rpcrdma_write_array *)p; + while (*p++ != xdr_zero) + p += 1 + be32_to_cpu(*p) * 4; + } else { + *write = NULL; + p++; + } + + /* Reply chunk */ + if (*p != xdr_zero) + *reply = (struct rpcrdma_write_array *)p; + else + *reply = NULL; } -/* Returns the address of the first reply array element or <nul> if no - * reply array is present +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first rkey it finds in the chunks lists. + * + * Returns zero if RPC's chunk lists are empty. */ -static struct rpcrdma_write_array * -svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array *wr_ary) +static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *wr_ary, + struct rpcrdma_write_array *rp_ary) { - struct rpcrdma_read_chunk *rch; - struct rpcrdma_write_array *rp_ary; - - /* XXX: Need to fix when reply chunk may occur with read list - * and/or write list. - */ - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || - rmsgp->rm_body.rm_chunks[1] != xdr_zero) - return NULL; - - rch = svc_rdma_get_read_chunk(rmsgp); - if (rch) { - while (rch->rc_discrim != xdr_zero) - rch++; - - /* The reply chunk follows an empty write array located - * at 'rc_position' here. The reply array is at rc_target. - */ - rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; - goto found_it; - } + struct rpcrdma_read_chunk *rd_ary; + struct rpcrdma_segment *arg_ch; - if (wr_ary) { - int chunk = be32_to_cpu(wr_ary->wc_nchunks); + rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; + if (rd_ary->rc_discrim != xdr_zero) + return be32_to_cpu(rd_ary->rc_target.rs_handle); - rp_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[chunk].wc_target.rs_length; - goto found_it; + if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { + arg_ch = &wr_ary->wc_array[0].wc_target; + return be32_to_cpu(arg_ch->rs_handle); } - /* No read list, no write list */ - rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; + if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { + arg_ch = &rp_ary->wc_array[0].wc_target; + return be32_to_cpu(arg_ch->rs_handle); + } - found_it: - if (rp_ary->wc_discrim == xdr_zero) - return NULL; - return rp_ary; + return 0; } /* Assumptions: @@ -280,7 +272,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge[sge_no].addr)) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; @@ -464,7 +456,8 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_req_map *vec, - int byte_count) + int byte_count, + u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; struct ib_send_wr send_wr; @@ -489,7 +482,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->direction = DMA_TO_DEVICE; @@ -505,7 +498,7 @@ static int send_reply(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[sge_no].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } @@ -523,23 +516,9 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* - * If there are more pages than SGE, terminate SGE - * list so that svc_rdma_unmap_dma doesn't attempt to - * unmap garbage. - */ - if (page_no+1 >= sge_no) - ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; - /* The loop above bumps sc_dma_used for each sge. The - * xdr_buf.tail gets a separate sge, but resides in the - * same page as xdr_buf.head. Don't count it twice. - */ - if (sge_no > ctxt->count) - atomic_dec(&rdma->sc_dma_used); - if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; @@ -549,7 +528,11 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = sge_no; - send_wr.opcode = IB_WR_SEND; + if (inv_rkey) { + send_wr.opcode = IB_WR_SEND_WITH_INV; + send_wr.ex.invalidate_rkey = inv_rkey; + } else + send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; ret = svc_rdma_send(rdma, &send_wr); @@ -581,6 +564,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; + u32 inv_rkey; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -588,8 +572,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * places this at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - wr_ary = svc_rdma_get_write_array(rdma_argp); - rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); + + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); /* Build an req vec for the XDR */ vec = svc_rdma_get_req_map(rdma); @@ -598,7 +585,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; inline_bytes = rqstp->rq_res.len; - /* Create the RDMA response header */ + /* Create the RDMA response header. xprt->xpt_mutex, + * acquired in svc_send(), serializes RPC replies. The + * code path below that inserts the credit grant value + * into each transport header runs only inside this + * critical section. + */ ret = -ENOMEM; res_page = alloc_page(GFP_KERNEL); if (!res_page) @@ -633,9 +625,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err1; ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes); + inline_bytes, inv_rkey); if (ret < 0) - goto err1; + goto err0; svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); @@ -692,7 +684,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, svc_rdma_put_context(ctxt, 1); return; } - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); /* Prepare SEND WR */ memset(&err_wr, 0, sizeof(err_wr)); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index dd9440137834..39652d390a9c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -41,6 +41,7 @@ */ #include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/interrupt.h> @@ -67,6 +68,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_kill_temp_xprt(struct svc_xprt *); static struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, @@ -79,6 +81,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, .xpo_secure_port = svc_rdma_secure_port, + .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, }; struct svc_xprt_class svc_rdma_class = { @@ -198,6 +201,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; + ctxt->mapped_sges = 0; ctxt->frmr = NULL; return ctxt; @@ -221,22 +225,24 @@ out_empty: void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + struct ib_device *device = xprt->sc_cm_id->device; + u32 lkey = xprt->sc_pd->local_dma_lkey; + unsigned int i; + + for (i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_page(xprt->sc_cm_id->device, + if (ctxt->sge[i].lkey == lkey) + ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); - } } + ctxt->mapped_sges = 0; } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -390,7 +396,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - ctxt->wc_status = wc->status; svc_rdma_unmap_dma(ctxt); if (wc->status != IB_WC_SUCCESS) @@ -428,7 +433,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, goto err; out: - atomic_dec(&xprt->sc_sq_count); + atomic_inc(&xprt->sc_sq_avail); wake_up(&xprt->sc_send_wait); return; @@ -600,7 +605,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; @@ -642,6 +647,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) return ret; } +static void +svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, + struct rdma_conn_param *param) +{ + const struct rpcrdma_connect_private *pmsg = param->private_data; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", + rpcrdma_decode_buffer_size(pmsg->cp_send_size), + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); + } +} + /* * This function handles the CONNECT_REQUEST event on a listening * endpoint. It is passed the cma_id for the _new_ connection. The context in @@ -653,7 +678,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, + struct rdma_conn_param *param) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -669,9 +695,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) new_cma_id->context = newxprt; dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + svc_rdma_parse_connect_private(newxprt, param); /* Save client advertised inbound read limit for use later in accept. */ - newxprt->sc_ord = client_ird; + newxprt->sc_ord = param->initiator_depth; /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; @@ -706,8 +733,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event = %s (%d)\n", cma_id, cma_id->context, rdma_event_msg(event->event), event->event); - handle_connect_req(cma_id, - event->param.conn.initiator_depth); + handle_connect_req(cma_id, &event->param.conn); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -917,7 +943,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - atomic_dec(&rdma->sc_dma_used); spin_lock_bh(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); @@ -941,8 +966,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; + struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; struct ib_device *dev; + struct sockaddr *sap; unsigned int i; int ret = 0; @@ -980,6 +1007,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; @@ -993,7 +1021,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(dev); + newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; @@ -1022,18 +1050,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; qp_attr.recv_cq = newxprt->sc_rq_cq; - dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" - " cm_id->device=%p, sc_pd->device=%p\n" - " cap.max_send_wr = %d\n" - " cap.max_recv_wr = %d\n" - " cap.max_send_sge = %d\n" - " cap.max_recv_sge = %d\n", - newxprt->sc_cm_id, newxprt->sc_pd, - dev, newxprt->sc_pd->device, - qp_attr.cap.max_send_wr, - qp_attr.cap.max_recv_wr, - qp_attr.cap.max_send_sge, - qp_attr.cap.max_recv_sge); + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", + newxprt->sc_cm_id, newxprt->sc_pd); + dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", + qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); + dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", + qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { @@ -1070,7 +1092,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; - } + } else + newxprt->sc_snd_w_inv = false; /* * Determine if a DMA MR is required and if so, what privs are required @@ -1094,11 +1117,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* Construct RDMA-CM private message */ + pmsg.cp_magic = rpcrdma_cmp_magic; + pmsg.cp_version = RPCRDMA_CMP_VERSION; + pmsg.cp_flags = 0; + pmsg.cp_send_size = pmsg.cp_recv_size = + rpcrdma_encode_buffer_size(newxprt->sc_max_req_size); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 0; conn_param.initiator_depth = newxprt->sc_ord; + conn_param.private_data = &pmsg; + conn_param.private_data_len = sizeof(pmsg); ret = rdma_accept(newxprt->sc_cm_id, &conn_param); if (ret) { dprintk("svcrdma: failed to accept new connection, ret=%d\n", @@ -1106,31 +1138,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } - dprintk("svcrdma: new connection %p accepted with the following " - "attributes:\n" - " local_ip : %pI4\n" - " local_port : %d\n" - " remote_ip : %pI4\n" - " remote_port : %d\n" - " max_sge : %d\n" - " max_sge_rd : %d\n" - " sq_depth : %d\n" - " max_requests : %d\n" - " ord : %d\n", - newxprt, - &((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.src_addr)->sin_addr.s_addr, - ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.src_addr)->sin_port), - &((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.dst_addr)->sin_addr.s_addr, - ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.dst_addr)->sin_port), - newxprt->sc_max_sge, - newxprt->sc_max_sge_rd, - newxprt->sc_sq_depth, - newxprt->sc_max_requests, - newxprt->sc_ord); + dprintk("svcrdma: new connection %p accepted:\n", newxprt); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_sge); + dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", newxprt->sc_ord); return &newxprt->sc_xprt; @@ -1184,9 +1201,9 @@ static void __svc_rdma_free(struct work_struct *work) ib_drain_qp(rdma->sc_qp); /* We should only be called from kref_put */ - if (atomic_read(&xprt->xpt_ref.refcount) != 0) + if (kref_read(&xprt->xpt_ref) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); /* * Destroy queued, but not processed read completions. Note @@ -1217,9 +1234,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", rdma->sc_ctxt_used); - if (atomic_read(&rdma->sc_dma_used) != 0) - pr_err("svcrdma: dma still in use? (%d)\n", - atomic_read(&rdma->sc_dma_used)); /* Final put of backchannel client transport */ if (xprt->xpt_bc_xprt) { @@ -1279,6 +1293,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } +static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; @@ -1295,15 +1313,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) /* If the SQ is full, wait until an SQ entry is available */ while (1) { - spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { - spin_unlock_bh(&xprt->sc_lock); + if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { atomic_inc(&rdma_stat_sq_starve); /* Wait until SQ WR available if SQ still full */ + atomic_add(wr_count, &xprt->sc_sq_avail); wait_event(xprt->sc_send_wait, - atomic_read(&xprt->sc_sq_count) < - xprt->sc_sq_depth); + atomic_read(&xprt->sc_sq_avail) > wr_count); if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; continue; @@ -1313,21 +1329,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) svc_xprt_get(&xprt->sc_xprt); /* Bump used SQ WR count and post */ - atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (ret) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - atomic_sub(wr_count, &xprt->sc_sq_count); for (i = 0; i < wr_count; i ++) svc_xprt_put(&xprt->sc_xprt); - dprintk("svcrdma: failed to post SQ WR rc=%d, " - "sc_sq_count=%d, sc_sq_depth=%d\n", - ret, atomic_read(&xprt->sc_sq_count), - xprt->sc_sq_depth); - } - spin_unlock_bh(&xprt->sc_lock); - if (ret) + dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); + dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", + atomic_read(&xprt->sc_sq_avail), + xprt->sc_sq_depth); wake_up(&xprt->sc_send_wait); + } break; } return ret; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 81f0e879f019..534c178d2a7e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_write, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -219,6 +219,34 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + schedule_delayed_work(&ep->rep_connect_worker, 0); +} + +void +rpcrdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) + xprt_wake_pending_tasks(xprt, 0); + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock_bh(&xprt->transport_lock); +} + static void xprt_rdma_connect_worker(struct work_struct *work) { @@ -477,115 +505,153 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual send/recv - * sequence. +/* Allocate a fixed-size buffer in which to construct and send the + * RPC-over-RDMA header for this request. + */ +static bool +rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + gfp_t flags) +{ + size_t size = RPCRDMA_HDRBUF_SIZE; + struct rpcrdma_regbuf *rb; + + if (req->rl_rdmabuf) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + r_xprt->rx_stats.hardway_register_count += size; + req->rl_rdmabuf = rb; + return true; +} + +static bool +rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_sendbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_sendbuf = rb; + return true; +} + +/* The rq_rcv_buf is used only if a Reply chunk is necessary. + * The decision to use a Reply chunk is made later in + * rpcrdma_marshal_req. This buffer is registered at that time. + * + * Otherwise, the associated RPC Reply arrives in a separate + * Receive buffer, arbitrarily chosen by the HCA. The buffer + * allocated here for the RPC Reply is not utilized in that + * case. See rpcrdma_inline_fixup. * - * The RPC layer allocates both send and receive buffers in the same call - * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). - * We may register rq_rcv_buf when using reply chunks. + * A regbuf is used here to remember the buffer size. */ -static void * -xprt_rdma_allocate(struct rpc_task *task, size_t size) +static bool +rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_regbuf *rb; + + if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_recvbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_recvbuf = rb; + return true; +} + +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual + * send/recv sequence. + * + * xprt_rdma_allocate provides buffers that are already mapped for + * DMA, and a local DMA lkey is provided for each. + */ +static int +xprt_rdma_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req; - size_t min_size; gfp_t flags; req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return NULL; + return -ENOMEM; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (req->rl_rdmabuf == NULL) - goto out_rdmabuf; - if (req->rl_sendbuf == NULL) - goto out_sendbuf; - if (size > req->rl_sendbuf->rg_size) - goto out_sendbuf; - -out: - dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); - req->rl_connect_cookie = 0; /* our reserved value */ - req->rl_task = task; - return req->rl_sendbuf->rg_base; - -out_rdmabuf: - min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); - if (IS_ERR(rb)) + if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) goto out_fail; - req->rl_rdmabuf = rb; - -out_sendbuf: - /* XDR encoding and RPC/RDMA marshaling of this request has not - * yet occurred. Thus a lower bound is needed to prevent buffer - * overrun during marshaling. - * - * RPC/RDMA marshaling may choose to send payload bearing ops - * inline, if the result is smaller than the inline threshold. - * The value of the "size" argument accounts for header - * requirements but not for the payload in these cases. - * - * Likewise, allocate enough space to receive a reply up to the - * size of the inline threshold. - * - * It's unlikely that both the send header and the received - * reply will be large, but slush is provided here to allow - * flexibility when marshaling. - */ - min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp); - min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); - if (size < min_size) - size = min_size; - - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); - if (IS_ERR(rb)) + if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + goto out_fail; + if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - rb->rg_owner = req; - r_xprt->rx_stats.hardway_register_count += size; - rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); - req->rl_sendbuf = rb; - goto out; + dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", + task->tk_pid, __func__, rqst->rq_callsize, + rqst->rq_rcvsize, req); + + req->rl_connect_cookie = 0; /* our reserved value */ + rpcrdma_set_xprtdata(rqst, req); + rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + return 0; out_fail: rpcrdma_buffer_put(req); - return NULL; + return -ENOMEM; } -/* - * This function returns all RDMA resources to the pool. +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. */ static void -xprt_rdma_free(void *buffer) +xprt_rdma_free(struct rpc_task *task) { - struct rpcrdma_req *req; - struct rpcrdma_xprt *r_xprt; - struct rpcrdma_regbuf *rb; - - if (buffer == NULL) - return; + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); - req = rb->rg_owner; if (req->rl_backchannel) return; - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, - !RPC_IS_ASYNC(req->rl_task)); - + if (unlikely(!list_empty(&req->rl_registered))) + ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); + rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); } @@ -620,7 +686,8 @@ xprt_rdma_send_request(struct rpc_task *task) int rc = 0; /* On retransmit, remove any previously registered chunks */ - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + if (unlikely(!list_empty(&req->rl_registered))) + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); rc = rpcrdma_marshal_req(rqst); if (rc < 0) @@ -685,10 +752,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, - r_xprt->rx_stats.mrs_allocated); + r_xprt->rx_stats.mrs_allocated, + r_xprt->rx_stats.local_inv_needed); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 799cce6cbe45..11d07748f699 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -103,9 +103,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; - pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ib_event_msg(event->event), - event->device->name, context); + pr_err("rpcrdma: %s on device %s ep %p\n", + ib_event_msg(event->event), event->device->name, context); + if (ep->rep_connected == 1) { ep->rep_connected = -EIO; rpcrdma_conn_func(ep); @@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) wc->status, wc->vendor_err); } -static void -rpcrdma_receive_worker(struct work_struct *work) -{ - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); - - rpcrdma_reply_handler(rep); -} - /* Perform basic sanity checking to avoid using garbage * to update the credit grant value. */ @@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) } /** - * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC + * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC * @cq: completion queue (ignored) * @wc: completed WR * */ static void -rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) +rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, @@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) __func__, rep, wc->byte_len); rep->rr_len = wc->byte_len; + rep->rr_wc_flags = wc->wc_flags; + rep->rr_inv_rkey = wc->ex.invalidate_rkey; + ib_dma_sync_single_for_cpu(rep->rr_device, rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); @@ -204,6 +198,36 @@ out_fail: goto out_schedule; } +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = param->private_data; + unsigned int rsize, wsize; + + /* Default settings for RPC-over-RDMA Version One */ + r_xprt->rx_ia.ri_reminv_expected = false; + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + r_xprt->rx_ia.ri_reminv_expected = true; + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + dprintk("RPC: %s: max send %u, max recv %u\n", + __func__, cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) " (%d initiator)\n", __func__, attr->max_dest_rd_atomic, attr->max_rd_atomic); + rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -306,6 +331,7 @@ static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) { + unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; struct rdma_cm_id *id; int rc; @@ -327,8 +353,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, __func__, rc); goto out; } - wait_for_completion_interruptible_timeout(&ia->ri_done, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + dprintk("RPC: %s: wait() exited: %i\n", + __func__, rc); + goto out; + } /* FIXME: * Until xprtrdma supports DEVICE_REMOVAL, the provider must @@ -351,8 +381,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, __func__, rc); goto put; } - wait_for_completion_interruptible_timeout(&ia->ri_done, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + dprintk("RPC: %s: wait() exited: %i\n", + __func__, rc); + goto put; + } rc = ia->ri_async_rc; if (rc) goto put; @@ -387,7 +421,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_device); + ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); @@ -454,11 +488,12 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_qp_wr; int rc; - if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; @@ -487,7 +522,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ - ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -506,7 +541,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; /* always signal? */ - INIT_CQCOUNT(ep); + rpcrdma_init_cqcount(ep, 0); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); @@ -536,9 +571,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Initialize cma parameters */ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); - /* RPC/RDMA does not use private data */ - ep->rep_remote_cma.private_data = NULL; - ep->rep_remote_cma.private_data_len = 0; + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; @@ -849,6 +889,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; INIT_LIST_HEAD(&req->rl_registered); + req->rl_send_wr.next = NULL; + req->rl_send_wr.wr_cqe = &req->rl_cqe; + req->rl_send_wr.sg_list = req->rl_send_sge; + req->rl_send_wr.opcode = IB_WR_SEND; return req; } @@ -865,17 +909,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, - GFP_KERNEL); + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + DMA_FROM_DEVICE, GFP_KERNEL); if (IS_ERR(rep->rr_rdmabuf)) { rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; } rep->rr_device = ia->ri_device; - rep->rr_cqe.done = rpcrdma_receive_wc; + rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; - INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); + INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); + rep->rr_recv_wr.next = NULL; + rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; + rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; + rep->rr_recv_wr.num_sge = 1; return rep; out_free: @@ -966,17 +1014,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) } static void -rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) +rpcrdma_destroy_rep(struct rpcrdma_rep *rep) { - rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); + rpcrdma_free_regbuf(rep->rr_rdmabuf); kfree(rep); } void -rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +rpcrdma_destroy_req(struct rpcrdma_req *req) { - rpcrdma_free_regbuf(ia, req->rl_sendbuf); - rpcrdma_free_regbuf(ia, req->rl_rdmabuf); + rpcrdma_free_regbuf(req->rl_recvbuf); + rpcrdma_free_regbuf(req->rl_sendbuf); + rpcrdma_free_regbuf(req->rl_rdmabuf); kfree(req); } @@ -1009,15 +1058,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - struct rpcrdma_ia *ia = rdmab_to_ia(buf); - cancel_delayed_work_sync(&buf->rb_recovery_worker); while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; rep = rpcrdma_buffer_get_rep_locked(buf); - rpcrdma_destroy_rep(ia, rep); + rpcrdma_destroy_rep(rep); } buf->rb_send_count = 0; @@ -1030,7 +1077,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) list_del(&req->rl_all); spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(ia, req); + rpcrdma_destroy_req(req); spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); @@ -1129,7 +1176,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_rep *rep = req->rl_reply; - req->rl_niovs = 0; + req->rl_send_wr.num_sge = 0; req->rl_reply = NULL; spin_lock(&buffers->rb_lock); @@ -1171,70 +1218,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) spin_unlock(&buffers->rb_lock); } -/* - * Wrappers for internal-use kmalloc memory registration, used by buffer code. - */ - /** - * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers - * @ia: controlling rpcrdma_ia + * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers * @size: size of buffer to be allocated, in bytes + * @direction: direction of data movement * @flags: GFP flags * - * Returns pointer to private header of an area of internally - * registered memory, or an ERR_PTR. The registered buffer follows - * the end of the private header. + * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that + * can be persistently DMA-mapped for I/O. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for - * receiving the payload of RDMA RECV operations. regbufs are not - * used for RDMA READ/WRITE operations, thus are registered only for - * LOCAL access. + * receiving the payload of RDMA RECV operations. During Long Calls + * or Replies they may be registered externally via ro_map. */ struct rpcrdma_regbuf * -rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) +rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, + gfp_t flags) { struct rpcrdma_regbuf *rb; - struct ib_sge *iov; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) - goto out; + return ERR_PTR(-ENOMEM); - iov = &rb->rg_iov; - iov->addr = ib_dma_map_single(ia->ri_device, - (void *)rb->rg_base, size, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - goto out_free; + rb->rg_device = NULL; + rb->rg_direction = direction; + rb->rg_iov.length = size; - iov->length = size; - iov->lkey = ia->ri_pd->local_dma_lkey; - rb->rg_size = size; - rb->rg_owner = NULL; return rb; +} -out_free: - kfree(rb); -out: - return ERR_PTR(-ENOMEM); +/** + * __rpcrdma_map_regbuf - DMA-map a regbuf + * @ia: controlling rpcrdma_ia + * @rb: regbuf to be mapped + */ +bool +__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (rb->rg_direction == DMA_NONE) + return false; + + rb->rg_iov.addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, + rdmab_length(rb), + rb->rg_direction); + if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb))) + return false; + + rb->rg_device = ia->ri_device; + rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + return true; +} + +static void +rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +{ + if (!rpcrdma_regbuf_is_mapped(rb)) + return; + + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), + rdmab_length(rb), rb->rg_direction); + rb->rg_device = NULL; } /** * rpcrdma_free_regbuf - deregister and free registered buffer - * @ia: controlling rpcrdma_ia * @rb: regbuf to be deregistered and freed */ void -rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) { - struct ib_sge *iov; - if (!rb) return; - iov = &rb->rg_iov; - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); + rpcrdma_dma_unmap_regbuf(rb); kfree(rb); } @@ -1248,39 +1306,22 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_device *device = ia->ri_device; - struct ib_send_wr send_wr, *send_wr_fail; - struct rpcrdma_rep *rep = req->rl_reply; - struct ib_sge *iov = req->rl_send_iov; - int i, rc; + struct ib_send_wr *send_wr = &req->rl_send_wr; + struct ib_send_wr *send_wr_fail; + int rc; - if (rep) { - rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (req->rl_reply) { + rc = rpcrdma_ep_post_recv(ia, req->rl_reply); if (rc) return rc; req->rl_reply = NULL; } - send_wr.next = NULL; - send_wr.wr_cqe = &req->rl_cqe; - send_wr.sg_list = iov; - send_wr.num_sge = req->rl_niovs; - send_wr.opcode = IB_WR_SEND; - - for (i = 0; i < send_wr.num_sge; i++) - ib_dma_sync_single_for_device(device, iov[i].addr, - iov[i].length, DMA_TO_DEVICE); dprintk("RPC: %s: posting %d s/g entries\n", - __func__, send_wr.num_sge); - - if (DECR_CQCOUNT(ep) > 0) - send_wr.send_flags = 0; - else { /* Provider must take a send completion every now and then */ - INIT_CQCOUNT(ep); - send_wr.send_flags = IB_SEND_SIGNALED; - } + __func__, send_wr->num_sge); - rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); + rpcrdma_set_signaled(ep, send_wr); + rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); if (rc) goto out_postsend_err; return 0; @@ -1290,32 +1331,24 @@ out_postsend_err: return -ENOTCONN; } -/* - * (Re)post a receive buffer. - */ int rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, struct rpcrdma_rep *rep) { - struct ib_recv_wr recv_wr, *recv_wr_fail; + struct ib_recv_wr *recv_wr_fail; int rc; - recv_wr.next = NULL; - recv_wr.wr_cqe = &rep->rr_cqe; - recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; - recv_wr.num_sge = 1; - - ib_dma_sync_single_for_cpu(ia->ri_device, - rdmab_addr(rep->rr_rdmabuf), - rdmab_length(rep->rr_rdmabuf), - DMA_BIDIRECTIONAL); - - rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) + goto out_map; + rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); if (rc) goto out_postrecv; return 0; +out_map: + pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); + return -EIO; + out_postrecv: pr_err("rpcrdma: ib_post_recv returned %i\n", rc); return -ENOTCONN; @@ -1333,7 +1366,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) { struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; int rc; @@ -1344,7 +1376,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) rep = rpcrdma_buffer_get_rep_locked(buffers); spin_unlock(&buffers->rb_lock); - rc = rpcrdma_ep_post_recv(ia, ep, rep); + rc = rpcrdma_ep_post_recv(ia, rep); if (rc) goto out_rc; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a71b0f5897d8..e35efd4ac1e4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,9 +70,12 @@ struct rpcrdma_ia { struct ib_pd *ri_pd; struct completion ri_done; int ri_async_rc; + unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; + bool ri_reminv_expected; + enum ib_mr_type ri_mrtype; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -87,13 +90,30 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; + struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; }; -#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) -#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +static inline void +rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count) +{ + atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count); +} + +/* To update send queue accounting, provider must take a + * send completion every now and then. + */ +static inline void +rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr) +{ + send_wr->send_flags = 0; + if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) { + rpcrdma_init_cqcount(ep, 0); + send_wr->send_flags = IB_SEND_SIGNALED; + } +} /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are @@ -112,9 +132,9 @@ struct rpcrdma_ep { */ struct rpcrdma_regbuf { - size_t rg_size; - struct rpcrdma_req *rg_owner; struct ib_sge rg_iov; + struct ib_device *rg_device; + enum dma_data_direction rg_direction; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -162,7 +182,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * The smallest inline threshold is 1024 bytes, ensuring that * at least 750 bytes are available for RPC messages. */ -#define RPCRDMA_MAX_HDR_SEGS (8) +enum { + RPCRDMA_MAX_HDR_SEGS = 8, + RPCRDMA_HDRBUF_SIZE = 256, +}; /* * struct rpcrdma_rep -- this structure encapsulates state required to recv @@ -182,10 +205,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) struct rpcrdma_rep { struct ib_cqe rr_cqe; unsigned int rr_len; + int rr_wc_flags; + u32 rr_inv_rkey; struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; struct list_head rr_list; + struct ib_recv_wr rr_recv_wr; struct rpcrdma_regbuf *rr_rdmabuf; }; @@ -207,7 +233,8 @@ struct rpcrdma_rep { enum rpcrdma_frmr_state { FRMR_IS_INVALID, /* ready to be used */ FRMR_IS_VALID, /* in use */ - FRMR_IS_STALE, /* failed completion */ + FRMR_FLUSHED_FR, /* flushed FASTREG WR */ + FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ }; struct rpcrdma_frmr { @@ -276,19 +303,30 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; -#define RPCRDMA_MAX_IOVS (2) +/* Reserve enough Send SGEs to send a maximum size inline request: + * - RPC-over-RDMA header + * - xdr_buf head iovec + * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages + * - xdr_buf tail iovec + */ +enum { + RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, + RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, + RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, +}; struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_free; - unsigned int rl_niovs; + unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; - struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; - struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_rep *rl_reply; + struct ib_send_wr rl_send_wr; + struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; + struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ struct ib_cqe rl_cqe; struct list_head rl_all; @@ -298,14 +336,16 @@ struct rpcrdma_req { struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - void *buffer = rqst->rq_buffer; - struct rpcrdma_regbuf *rb; - - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); - return rb->rg_owner; + return rqst->rq_xprtdata; } /* @@ -356,15 +396,6 @@ struct rpcrdma_create_data_internal { unsigned int padding; /* non-rdma write header padding */ }; -#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ - (rpcx_to_rdmad(rq->rq_xprt).inline_rsize) - -#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ - (rpcx_to_rdmad(rq->rq_xprt).inline_wsize) - -#define RPCRDMA_INLINE_PAD_VALUE(rq)\ - rpcx_to_rdmad(rq->rq_xprt).padding - /* * Statistics for RPCRDMA */ @@ -386,6 +417,7 @@ struct rpcrdma_stats { unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + unsigned long local_inv_needed; }; /* @@ -409,6 +441,7 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mw *); void (*ro_release_mr)(struct rpcrdma_mw *); const char *ro_displayname; + const int ro_send_w_inv_ok; }; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; @@ -457,19 +490,19 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_rep *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); -void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); +void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); @@ -482,10 +515,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); -struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, - size_t, gfp_t); -void rpcrdma_free_regbuf(struct rpcrdma_ia *, - struct rpcrdma_regbuf *); +struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, + gfp_t); +bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); + +static inline bool +rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device != NULL; +} + +static inline bool +rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (likely(rpcrdma_regbuf_is_mapped(rb))) + return true; + return __rpcrdma_dma_map_regbuf(ia, rb); +} int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); @@ -503,25 +550,30 @@ rpcrdma_data_dir(bool writing) } /* - * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c - */ -void rpcrdma_connect_worker(struct work_struct *); -void rpcrdma_conn_func(struct rpcrdma_ep *); -void rpcrdma_reply_handler(struct rpcrdma_rep *); - -/* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, + u32, struct xdr_buf *, enum rpcrdma_chunktype); +void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_marshal_req(struct rpc_rqst *); -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *, - unsigned int); +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_reply_handler(struct work_struct *work); /* RPC/RDMA module init - xprtrdma/transport.c */ extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf168838a029..af392d9b9cec 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); /* Race breaker in case memory is freed before above code is called */ - sk->sk_write_space(sk); + if (ret == -EAGAIN) { + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); + rcu_read_unlock(); + + sk->sk_write_space(sk); + } return ret; } @@ -1071,10 +1080,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport) if (sk == NULL) goto out; for (;;) { - skb = skb_recv_datagram(sk, 0, 1, &err); + skb = skb_recv_udp(sk, 0, 1, &err); if (skb != NULL) { xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram_locked(sk, skb); + consume_skb(skb); continue; } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) @@ -2533,35 +2542,39 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ -static void *bc_malloc(struct rpc_task *task, size_t size) +static int bc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; - WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) - return NULL; + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { + WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } page = alloc_page(GFP_KERNEL); if (!page) - return NULL; + return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; - return buf->data; + rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; + return 0; } /* * Free the space allocated in the bc_alloc routine */ -static void bc_free(void *buffer) +static void bc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } |