summaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_generic.c13
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c29
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c94
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c4
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c27
-rw-r--r--net/sunrpc/auth_unix.c13
-rw-r--r--net/sunrpc/backchannel_rqst.c8
-rw-r--r--net/sunrpc/cache.c9
-rw-r--r--net/sunrpc/clnt.c142
-rw-r--r--net/sunrpc/netns.h2
-rw-r--r--net/sunrpc/rpc_pipe.c2
-rw-r--r--net/sunrpc/sched.c35
-rw-r--r--net/sunrpc/stats.c10
-rw-r--r--net/sunrpc/sunrpc_syms.c3
-rw-r--r--net/sunrpc/svc.c31
-rw-r--r--net/sunrpc/svc_xprt.c25
-rw-r--r--net/sunrpc/svcauth.c18
-rw-r--r--net/sunrpc/svcauth_unix.c6
-rw-r--r--net/sunrpc/svcsock.c47
-rw-r--r--net/sunrpc/sysctl.c2
-rw-r--r--net/sunrpc/xdr.c11
-rw-r--r--net/sunrpc/xprt.c5
-rw-r--r--net/sunrpc/xprtmultipath.c24
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c55
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c145
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c359
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c36
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c25
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c156
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c156
-rw-r--r--net/sunrpc/xprtrdma/transport.c234
-rw-r--r--net/sunrpc/xprtrdma/verbs.c268
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h140
-rw-r--r--net/sunrpc/xprtsock.c39
37 files changed, 1308 insertions, 877 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a7e42f9a405c..2bff63a73cf8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
*entry, *new;
unsigned int nr;
- nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+ nr = auth->au_ops->hash_cred(acred, cache->hashbits);
rcu_read_lock();
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 168219535a34..f1df9837f1ac 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
return auth->au_ops->lookup_cred(auth, acred, lookupflags);
}
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kgid(&init_user_ns, acred->gid) |
+ ((u64)from_kuid(&init_user_ns, acred->uid) <<
+ (sizeof(gid_t) * 8)), hashbits);
+}
+
/*
* Lookup generic creds for current process
*/
@@ -176,8 +184,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
goto out_nomatch;
for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
- if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
- GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(gcred->acred.group_info->gid[i],
+ acred->group_info->gid[i]))
goto out_nomatch;
}
out_match:
@@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
static const struct rpc_authops generic_auth_ops = {
.owner = THIS_MODULE,
.au_name = "Generic",
+ .hash_cred = generic_hash_cred,
.lookup_cred = generic_lookup_cred,
.crcreate = generic_create_cred,
.key_timeout = generic_key_timeout,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 976c7812bbd5..cdeb1d814833 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -50,7 +50,7 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/gss_api.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/hashtable.h>
#include "../netns.h"
@@ -541,9 +541,13 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
return gss_new;
gss_msg = gss_add_msg(gss_new);
if (gss_msg == gss_new) {
- int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
+ int res;
+ atomic_inc(&gss_msg->count);
+ res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
if (res) {
gss_unhash_msg(gss_new);
+ atomic_dec(&gss_msg->count);
+ gss_release_msg(gss_new);
gss_msg = ERR_PTR(res);
}
} else
@@ -836,6 +840,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
warn_gssd();
gss_release_msg(gss_msg);
}
+ gss_release_msg(gss_msg);
}
static void gss_pipe_dentry_destroy(struct dentry *dir,
@@ -1298,6 +1303,12 @@ gss_destroy_cred(struct rpc_cred *cred)
gss_destroy_nullcred(cred);
}
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+}
+
/*
* Lookup RPCSEC_GSS cred for the current process
*/
@@ -1610,7 +1621,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- __be32 seq;
+ __be32 *seq = NULL;
struct kvec iov;
struct xdr_buf verf_buf;
struct xdr_netobj mic;
@@ -1625,9 +1636,12 @@ gss_validate(struct rpc_task *task, __be32 *p)
goto out_bad;
if (flav != RPC_AUTH_GSS)
goto out_bad;
- seq = htonl(task->tk_rqstp->rq_seqno);
- iov.iov_base = &seq;
- iov.iov_len = sizeof(seq);
+ seq = kmalloc(4, GFP_NOFS);
+ if (!seq)
+ goto out_bad;
+ *seq = htonl(task->tk_rqstp->rq_seqno);
+ iov.iov_base = seq;
+ iov.iov_len = 4;
xdr_buf_from_iov(&iov, &verf_buf);
mic.data = (u8 *)p;
mic.len = len;
@@ -1647,11 +1661,13 @@ gss_validate(struct rpc_task *task, __be32 *p)
gss_put_ctx(ctx);
dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
task->tk_pid, __func__);
+ kfree(seq);
return p + XDR_QUADLEN(len);
out_bad:
gss_put_ctx(ctx);
dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
PTR_ERR(ret));
+ kfree(seq);
return ret;
}
@@ -1982,6 +1998,7 @@ static const struct rpc_authops authgss_ops = {
.au_name = "RPCSEC_GSS",
.create = gss_create,
.destroy = gss_destroy,
+ .hash_cred = gss_hash_cred,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
.list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 244245bcbbd2..fb39284ec174 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -166,8 +166,8 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
unsigned int usage, struct xdr_netobj *cksumout)
{
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
u8 rc4salt[4];
struct crypto_ahash *md5;
struct crypto_ahash *hmac_md5;
@@ -187,23 +187,22 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (!checksumdata)
+ return GSS_S_FAILURE;
+
md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(md5))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
CRYPTO_ALG_ASYNC);
- if (IS_ERR(hmac_md5)) {
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ if (IS_ERR(hmac_md5))
+ goto out_free_md5;
- req = ahash_request_alloc(md5, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(hmac_md5);
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(md5, GFP_NOFS);
+ if (!req)
+ goto out_free_hmac_md5;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -231,12 +230,9 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
goto out;
ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(hmac_md5);
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(hmac_md5, GFP_NOFS);
+ if (!req)
+ goto out_free_hmac_md5;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -258,8 +254,12 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
cksumout->len = kctx->gk5e->cksumlength;
out:
ahash_request_free(req);
- crypto_free_ahash(md5);
+out_free_hmac_md5:
crypto_free_ahash(hmac_md5);
+out_free_md5:
+ crypto_free_ahash(md5);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -276,8 +276,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
struct crypto_ahash *tfm;
struct ahash_request *req;
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
unsigned int checksumlen;
if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
@@ -291,15 +291,17 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (checksumdata == NULL)
+ return GSS_S_FAILURE;
+
tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
- req = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(tfm);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(tfm, GFP_NOFS);
+ if (!req)
+ goto out_free_ahash;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -349,7 +351,10 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
cksumout->len = kctx->gk5e->cksumlength;
out:
ahash_request_free(req);
+out_free_ahash:
crypto_free_ahash(tfm);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -368,8 +373,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
struct crypto_ahash *tfm;
struct ahash_request *req;
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
unsigned int checksumlen;
if (kctx->gk5e->keyed_cksum == 0) {
@@ -383,16 +388,18 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (!checksumdata)
+ return GSS_S_FAILURE;
+
tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
checksumlen = crypto_ahash_digestsize(tfm);
- req = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(tfm);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(tfm, GFP_NOFS);
+ if (!req)
+ goto out_free_ahash;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -433,7 +440,10 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
}
out:
ahash_request_free(req);
+out_free_ahash:
crypto_free_ahash(tfm);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -666,14 +676,17 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
u32 ret;
struct scatterlist sg[1];
SKCIPHER_REQUEST_ON_STACK(req, cipher);
- u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];
+ u8 *data;
struct page **save_pages;
u32 len = buf->len - offset;
- if (len > ARRAY_SIZE(data)) {
+ if (len > GSS_KRB5_MAX_BLOCKSIZE * 2) {
WARN_ON(0);
return -ENOMEM;
}
+ data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
/*
* For encryption, we want to read from the cleartext
@@ -708,6 +721,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
ret = write_bytes_to_xdr_buf(buf, offset, data, len);
out:
+ kfree(data);
return ret;
}
@@ -949,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
@@ -1016,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 60595835317a..7bb2514aadd9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
goto out_err_free_hmac;
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate hash descriptor for '%s'\n",
__func__, ctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index eeeba5adee6d..25d9a9cf7b66 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
kgid = make_kgid(&init_user_ns, tmp);
if (!gid_valid(kgid))
goto out_free_groups;
- GROUP_AT(creds->cr_group_info, i) = kgid;
+ creds->cr_group_info->gid[i] = kgid;
}
return 0;
@@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
if (!oa->data)
return -ENOMEM;
- creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
if (!creds) {
kfree(oa->data);
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index d8582028b346..153082598522 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd,
kgid = make_kgid(&init_user_ns, id);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
+ rsci.cred.cr_group_info->gid[i] = kgid;
}
/* mech name */
@@ -718,30 +718,37 @@ gss_write_null_verf(struct svc_rqst *rqstp)
static int
gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
{
- __be32 xdr_seq;
+ __be32 *xdr_seq;
u32 maj_stat;
struct xdr_buf verf_data;
struct xdr_netobj mic;
__be32 *p;
struct kvec iov;
+ int err = -1;
svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS);
- xdr_seq = htonl(seq);
+ xdr_seq = kmalloc(4, GFP_KERNEL);
+ if (!xdr_seq)
+ return -1;
+ *xdr_seq = htonl(seq);
- iov.iov_base = &xdr_seq;
- iov.iov_len = sizeof(xdr_seq);
+ iov.iov_base = xdr_seq;
+ iov.iov_len = 4;
xdr_buf_from_iov(&iov, &verf_data);
p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
mic.data = (u8 *)(p + 1);
maj_stat = gss_get_mic(ctx_id, &verf_data, &mic);
if (maj_stat != GSS_S_COMPLETE)
- return -1;
+ goto out;
*p++ = htonl(mic.len);
memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len);
p += XDR_QUADLEN(mic.len);
if (!xdr_ressize_check(rqstp, p))
- return -1;
- return 0;
+ goto out;
+ err = 0;
+out:
+ kfree(xdr_seq);
+ return err;
}
struct gss_domain {
@@ -1482,7 +1489,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
- rsci->h.expiry_time = get_seconds();
+ rsci->h.expiry_time = seconds_since_boot();
set_bit(CACHE_NEGATIVE, &rsci->h.flags);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
@@ -1541,7 +1548,7 @@ complete:
ret = SVC_COMPLETE;
goto out;
drop:
- ret = SVC_DROP;
+ ret = SVC_CLOSE;
out:
if (rsci)
cache_put(&rsci->h, sn->rsc_cache);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a99278c984e8..306fc0f54596 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth)
rpcauth_clear_credcache(auth->au_credcache);
}
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kgid(&init_user_ns, acred->gid) |
+ ((u64)from_kuid(&init_user_ns, acred->uid) <<
+ (sizeof(gid_t) * 8)), hashbits);
+}
+
/*
* Lookup AUTH_UNIX creds for current process
*/
@@ -79,7 +87,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
cred->uc_gid = acred->gid;
for (i = 0; i < groups; i++)
- cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+ cred->uc_gids[i] = acred->group_info->gid[i];
if (i < NFS_NGROUPS)
cred->uc_gids[i] = INVALID_GID;
@@ -127,7 +135,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
if (groups > NFS_NGROUPS)
groups = NFS_NGROUPS;
for (i = 0; i < groups ; i++)
- if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
return 0;
if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
@@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = {
.au_name = "UNIX",
.create = unx_create,
.destroy = unx_destroy,
+ .hash_cred = unx_hash_cred,
.lookup_cred = unx_lookup_cred,
.crcreate = unx_create_cred,
};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 229956bf8457..ac701c28f44f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
page = alloc_page(gfp_flags);
if (page == NULL)
return -ENOMEM;
- buf->head[0].iov_base = page_address(page);
- buf->head[0].iov_len = PAGE_SIZE;
- buf->tail[0].iov_base = NULL;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->len = 0;
- buf->buflen = PAGE_SIZE;
+ xdr_buf_init(buf, page_address(page), PAGE_SIZE);
return 0;
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4d8e11f94a35..f39e3e11f9aa 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string_helpers.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
@@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
spin_unlock(&cache_list_lock);
/* start the cleaning process */
- schedule_delayed_work(&cache_cleaner, 0);
+ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
}
EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
@@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work)
delay = 0;
if (delay)
- schedule_delayed_work(&cache_cleaner, delay);
+ queue_delayed_work(system_power_efficient_wq,
+ &cache_cleaner, delay);
}
@@ -1357,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p)
ifdebug(CACHE)
seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
- atomic_read(&cp->ref.refcount), cp->flags);
+ kref_read(&cp->ref), cp->flags);
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b376fa0..1dc9f3bac099 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
struct super_block *sb)
{
struct dentry *dentry;
- int err = 0;
switch (event) {
case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
return -ENOTSUPP;
}
- return err;
+ return 0;
}
static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -337,6 +336,11 @@ out:
static DEFINE_IDA(rpc_clids);
+void rpc_cleanup_clids(void)
+{
+ ida_destroy(&rpc_clids);
+}
+
static int rpc_alloc_clid(struct rpc_clnt *clnt)
{
int clid;
@@ -988,7 +992,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
{
if (clnt != NULL) {
- rpc_task_release_client(task);
if (task->tk_xprt == NULL)
task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
task->tk_client = clnt;
@@ -1693,6 +1696,7 @@ call_allocate(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+ int status;
dprint_status(task);
@@ -1718,11 +1722,14 @@ call_allocate(struct rpc_task *task)
req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
req->rq_rcvsize <<= 2;
- req->rq_buffer = xprt->ops->buf_alloc(task,
- req->rq_callsize + req->rq_rcvsize);
- if (req->rq_buffer != NULL)
- return;
+ status = xprt->ops->buf_alloc(task);
xprt_inject_disconnect(xprt);
+ if (status == 0)
+ return;
+ if (status != -ENOMEM) {
+ rpc_exit(task, status);
+ return;
+ }
dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
@@ -1748,18 +1755,6 @@ rpc_task_force_reencode(struct rpc_task *task)
task->tk_rqstp->rq_bytes_sent = 0;
}
-static inline void
-rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
-{
- buf->head[0].iov_base = start;
- buf->head[0].iov_len = len;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->flags = 0;
- buf->len = 0;
- buf->buflen = len;
-}
-
/*
* 3. Encode arguments of an RPC call
*/
@@ -1772,12 +1767,12 @@ rpc_xdr_encode(struct rpc_task *task)
dprint_status(task);
- rpc_xdr_buf_init(&req->rq_snd_buf,
- req->rq_buffer,
- req->rq_callsize);
- rpc_xdr_buf_init(&req->rq_rcv_buf,
- (char *)req->rq_buffer + req->rq_callsize,
- req->rq_rcvsize);
+ xdr_buf_init(&req->rq_snd_buf,
+ req->rq_buffer,
+ req->rq_callsize);
+ xdr_buf_init(&req->rq_rcv_buf,
+ req->rq_rbuffer,
+ req->rq_rcvsize);
p = rpc_encode_header(task);
if (p == NULL) {
@@ -1936,6 +1931,8 @@ call_connect_status(struct rpc_task *task)
case -EADDRINUSE:
case -ENOBUFS:
case -EPIPE:
+ xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
+ task->tk_rqstp->rq_connect_cookie);
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
@@ -2616,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
/**
+ * rpc_clnt_setup_test_and_add_xprt()
+ *
+ * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
+ * 1) caller of the test function must dereference the rpc_xprt_switch
+ * and the rpc_xprt.
+ * 2) test function must call rpc_xprt_switch_add_xprt, usually in
+ * the rpc_call_done routine.
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps: the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ * and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+ struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt,
+ void *data)
+{
+ struct rpc_cred *cred;
+ struct rpc_task *task;
+ struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+ int status = -EADDRINUSE;
+
+ xprt = xprt_get(xprt);
+ xprt_switch_get(xps);
+
+ if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ goto out_err;
+
+ /* Test the connection */
+ cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+ task = rpc_call_null_helper(clnt, xprt, cred,
+ RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+ NULL, NULL);
+ put_rpccred(cred);
+ if (IS_ERR(task)) {
+ status = PTR_ERR(task);
+ goto out_err;
+ }
+ status = task->tk_status;
+ rpc_put_task(task);
+
+ if (status < 0)
+ goto out_err;
+
+ /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
+ xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+ /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+ return 1;
+out_err:
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+ pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n",
+ status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
+/**
* rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
* @clnt: pointer to struct rpc_clnt
* @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2758,37 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
}
EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
+{
+ rcu_read_lock();
+ xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ rcu_read_lock();
+ rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+ xprt);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+ const struct sockaddr *sap)
+{
+ struct rpc_xprt_switch *xps;
+ bool ret;
+
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ ret = rpc_xprt_switch_has_addr(xps, sap);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static void rpc_show_header(void)
{
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index df5826876535..394ce523174c 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -34,7 +34,7 @@ struct sunrpc_net {
struct proc_dir_entry *use_gssp_proc;
};
-extern int sunrpc_net_id;
+extern unsigned int sunrpc_net_id;
int ip_map_cache_create(struct net *);
void ip_map_cache_destroy(struct net *);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 84f98cbe31c3..61a504fb1ae2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -477,7 +477,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9ae588511aaf..5db68b371db2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work)
}
/**
- * rpc_malloc - allocate an RPC buffer
- * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * rpc_malloc - allocate RPC buffer resources
+ * @task: RPC task
+ *
+ * A single memory region is allocated, which is split between the
+ * RPC call and RPC reply that this task is being used for. When
+ * this RPC is retired, the memory is released by calling rpc_free.
*
* To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL and suppressing warning if the request cannot be serviced
- * immediately.
- * The caller can arrange to sleep in a way that is safe for rpciod.
+ * returning -ENOMEM and suppressing warning if the request cannot
+ * be serviced immediately. The caller can arrange to sleep in a
+ * way that is safe for rpciod.
*
* Most requests are 'small' (under 2KiB) and can be serviced from a
* mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work)
* In order to avoid memory starvation triggering more writebacks of
* NFS requests, we avoid using GFP_KERNEL.
*/
-void *rpc_malloc(struct rpc_task *task, size_t size)
+int rpc_malloc(struct rpc_task *task)
{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
struct rpc_buffer *buf;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
@@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
buf = kmalloc(size, gfp);
if (!buf)
- return NULL;
+ return -ENOMEM;
buf->len = size;
dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
task->tk_pid, size, buf);
- return &buf->data;
+ rqst->rq_buffer = buf->data;
+ rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+ return 0;
}
EXPORT_SYMBOL_GPL(rpc_malloc);
/**
- * rpc_free - free buffer allocated via rpc_malloc
- * @buffer: buffer to free
+ * rpc_free - free RPC buffer resources allocated via rpc_malloc
+ * @task: RPC task
*
*/
-void rpc_free(void *buffer)
+void rpc_free(struct rpc_task *task)
{
+ void *buffer = task->tk_rqstp->rq_buffer;
size_t size;
struct rpc_buffer *buf;
- if (!buffer)
- return;
-
buf = container_of(buffer, struct rpc_buffer, data);
size = buf->len;
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2ecb994314c1..caeb01ad2b5a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
spin_lock(&op_metrics->om_lock);
op_metrics->om_ops++;
- op_metrics->om_ntrans += req->rq_ntrans;
+ /* kernel API: om_ops must never become larger than om_ntrans */
+ op_metrics->om_ntrans += max(req->rq_ntrans, 1);
op_metrics->om_timeouts += task->tk_timeouts;
op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
- delta = ktime_sub(req->rq_xtime, task->tk_start);
- op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
-
+ if (ktime_to_ns(req->rq_xtime)) {
+ delta = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+ }
op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
delta = ktime_sub(now, task->tk_start);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index ee5d3d253102..c73de181467a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,7 +24,7 @@
#include "netns.h"
-int sunrpc_net_id;
+unsigned int sunrpc_net_id;
EXPORT_SYMBOL_GPL(sunrpc_net_id);
static __net_init int sunrpc_init_net(struct net *net)
@@ -119,6 +119,7 @@ out:
static void __exit
cleanup_sunrpc(void)
{
+ rpc_cleanup_clids();
rpcauth_remove_module();
cleanup_socket_xprt();
svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..75f290bddca1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
}
EXPORT_SYMBOL_GPL(svc_bind);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+ INIT_LIST_HEAD(&serv->sv_cb_list);
+ spin_lock_init(&serv->sv_cb_lock);
+ init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
/*
* Create an RPC service
*/
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
init_timer(&serv->sv_temptimer);
spin_lock_init(&serv->sv_lock);
+ __svc_init_bc(serv);
+
serv->sv_nrpools = npools;
serv->sv_pools =
kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
@@ -1138,8 +1155,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
- svc_close_xprt(rqstp->rq_xprt);
+ goto close;
case SVC_DROP:
goto dropit;
case SVC_COMPLETE:
@@ -1229,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
sendit:
if (svc_authorise(rqstp))
- goto dropit;
+ goto close;
return 1; /* Caller can now send it */
dropit:
@@ -1237,11 +1253,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
dprintk("svc: svc_process dropit\n");
return 0;
+ close:
+ if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ svc_close_xprt(rqstp->rq_xprt);
+ dprintk("svc: svc_process close\n");
+ return 0;
+
err_short_len:
svc_printk(rqstp, "short len %Zd, dropping request\n",
argv->iov_len);
-
- goto dropit; /* drop request */
+ goto close;
err_bad_rpc:
serv->sv_stats->rpcbadfmt++;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3f652395a80..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
svc_xprt_get(xprt);
dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, atomic_read(&xprt->xpt_ref.refcount));
+ xprt, kref_read(&xprt->xpt_ref));
}
spin_unlock_bh(&pool->sp_lock);
out:
@@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
dprintk("svc_recv: found XPT_CLOSE\n");
+ if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
+ xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_delete_xprt(xprt);
/* Leave XPT_BUSY set on the dead xprt: */
goto out;
@@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
rqstp->rq_deferred = svc_deferred_dequeue(xprt);
if (rqstp->rq_deferred)
len = svc_deferred_recv(rqstp);
@@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
* through, close it. */
if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
continue;
- if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+ if (kref_read(&xprt->xpt_ref) > 1 ||
test_bit(XPT_BUSY, &xprt->xpt_flags))
continue;
list_del_init(le);
@@ -1002,14 +1004,8 @@ static void svc_age_temp_xprts(unsigned long closure)
void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
{
struct svc_xprt *xprt;
- struct svc_sock *svsk;
- struct socket *sock;
struct list_head *le, *next;
LIST_HEAD(to_be_closed);
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
spin_lock_bh(&serv->sv_lock);
list_for_each_safe(le, next, &serv->sv_tempsocks) {
@@ -1026,12 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
le = to_be_closed.next;
list_del_init(le);
xprt = list_entry(le, struct svc_xprt, xpt_list);
- dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
- svsk = container_of(xprt, struct svc_sock, sk_xprt);
- sock = svsk->sk_sock;
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
- svc_close_xprt(xprt);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ set_bit(XPT_KILL_TEMP, &xprt->xpt_flags);
+ dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n",
+ xprt);
+ svc_xprt_enqueue(xprt);
}
}
EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 69841db1f533..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -124,16 +124,20 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
#define DN_HASHMAX (1<<DN_HASHBITS)
static struct hlist_head auth_domain_table[DN_HASHMAX];
-static spinlock_t auth_domain_lock =
- __SPIN_LOCK_UNLOCKED(auth_domain_lock);
+static DEFINE_SPINLOCK(auth_domain_lock);
+
+static void auth_domain_release(struct kref *kref)
+{
+ struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+ hlist_del(&dom->hash);
+ dom->flavour->domain_release(dom);
+ spin_unlock(&auth_domain_lock);
+}
void auth_domain_put(struct auth_domain *dom)
{
- if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
- hlist_del(&dom->hash);
- dom->flavour->domain_release(dom);
- spin_unlock(&auth_domain_lock);
- }
+ kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
}
EXPORT_SYMBOL_GPL(auth_domain_put);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index dfacdc95b3f5..64af4f034de6 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd,
kgid = make_kgid(&init_user_ns, gid);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(ug.gi, i) = kgid;
+ ug.gi->gid[i] = kgid;
}
ugp = unix_gid_lookup(cd, uid);
@@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m,
seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
for (i = 0; i < glen; i++)
- seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
+ seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i]));
seq_printf(m, "\n");
return 0;
}
@@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
return SVC_CLOSE;
for (i = 0; i < slen; i++) {
kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
- GROUP_AT(cred->cr_group_info, i) = kgid;
+ cred->cr_group_info->gid[i] = kgid;
}
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
*authp = rpc_autherr_badverf;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 57625f64efd5..de066acdb34e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -39,9 +39,10 @@
#include <net/checksum.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/udp.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <trace/events/skb.h>
@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp)
}
}
+static void svc_release_udp_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+
+ if (skb) {
+ rqstp->rq_xprt_ctxt = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ consume_skb(skb);
+ }
+}
+
union svc_pktinfo_u {
struct in_pktinfo pkti;
struct in6_pktinfo pkti6;
@@ -438,6 +451,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
+static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ struct linger no_linger = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ sock = svsk->sk_sock;
+ kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&no_linger, sizeof(no_linger));
+}
+
/*
* See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
*/
@@ -534,7 +562,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
if (err >= 0)
- skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+ skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
if (skb == NULL) {
if (err != -EAGAIN) {
@@ -546,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
}
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
- if (skb->tstamp.tv64 == 0) {
+ if (skb->tstamp == 0) {
skb->tstamp = ktime_get_real();
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
@@ -575,7 +603,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
goto out_free;
}
local_bh_enable();
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ consume_skb(skb);
} else {
/* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data;
@@ -602,8 +630,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
return len;
out_free:
- trace_kfree_skb(skb, svc_udp_recvfrom);
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ kfree_skb(skb);
return 0;
}
@@ -648,6 +675,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
return NULL;
}
+static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -660,13 +691,14 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_release_rqst = svc_release_skb,
+ .xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt,
};
static struct svc_xprt_class svc_udp_class = {
@@ -1242,6 +1274,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt,
};
static struct svc_xprt_class svc_tcp_class = {
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index c88d9bc06f5c..8c3936403fea 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -14,7 +14,7 @@
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/stats.h>
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c4f3cc0c0775..7f1071e103ca 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase -= xdr->buf->page_base;
if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
xdr_set_next_page(xdr);
else if (xdr->iov == xdr->buf->head) {
if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
}
return xdr->p != xdr->end;
}
@@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
- void *cpdest = xdr->scratch.iov_base;
+ char *cpdest = xdr->scratch.iov_base;
size_t cplen = (char *)xdr->end - (char *)xdr->p;
if (nbytes > xdr->scratch.iov_len)
return NULL;
- memcpy(cpdest, xdr->p, cplen);
+ p = __xdr_inline_decode(xdr, cplen);
+ if (p == NULL)
+ return NULL;
+ memcpy(cpdest, p, cplen);
cpdest += cplen;
nbytes -= cplen;
if (!xdr_set_next_buffer(xdr))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ea244b29138b..9a6be030ca7d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -669,7 +669,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
spin_lock_bh(&xprt->transport_lock);
if (cookie != xprt->connect_cookie)
goto out;
- if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt))
+ if (test_bit(XPRT_CLOSING, &xprt->state))
goto out;
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
@@ -772,6 +772,7 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_connected(xprt)) {
task->tk_rqstp->rq_bytes_sent = 0;
task->tk_timeout = task->tk_rqstp->rq_timeout;
+ task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -1295,7 +1296,7 @@ void xprt_release(struct rpc_task *task)
xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
- xprt->ops->buf_free(req->rq_buffer);
+ xprt->ops->buf_free(task);
xprt_inject_disconnect(xprt);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 66c9d63f4797..ae92a9e9ba52 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -15,6 +15,7 @@
#include <asm/cmpxchg.h>
#include <linux/spinlock.h>
#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtmultipath.h>
typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+ if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
+ !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
@@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
}
+bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+ const struct sockaddr *sap)
+{
+ struct list_head *head;
+ struct rpc_xprt *pos;
+
+ if (xps == NULL || sap == NULL)
+ return false;
+
+ head = &xps->xps_xprt_list;
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
+ pr_info("RPC: addr %s already in xprt switch\n",
+ pos->address_strings[RPC_DISPLAY_ADDR]);
+ return true;
+ }
+ }
+ return false;
+}
+
static
struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
const struct rpc_xprt *cur)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 87762d976b63..24fedd4b117e 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
list_del(&req->rl_all);
spin_unlock(&buf->rb_reqslock);
- rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+ rpcrdma_destroy_req(req);
kfree(rqst);
}
@@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
- struct xdr_buf *buf;
size_t size;
req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,20 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
return PTR_ERR(req);
req->rl_backchannel = true;
- size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
- rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+ DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
goto out_fail;
req->rl_rdmabuf = rb;
- size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
- rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ size = r_xprt->rx_data.inline_rsize;
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
goto out_fail;
- rb->rg_owner = req;
req->rl_sendbuf = rb;
- /* so that rpcr_to_rdmar works when receiving a request */
- rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
-
- buf = &rqst->rq_snd_buf;
- buf->head[0].iov_base = rqst->rq_buffer;
- buf->head[0].iov_len = 0;
- buf->tail[0].iov_base = NULL;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->len = 0;
- buf->buflen = size;
-
+ xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
+ min_t(size_t, size, PAGE_SIZE));
+ rpcrdma_set_xprtdata(rqst, req);
return 0;
out_fail:
@@ -204,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
size_t maxmsg;
maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -219,7 +208,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_msg *headerp;
- size_t rpclen;
headerp = rdmab_to_msg(req->rl_rdmabuf);
headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +219,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
headerp->rm_body.rm_chunks[1] = xdr_zero;
headerp->rm_body.rm_chunks[2] = xdr_zero;
- rpclen = rqst->rq_svec[0].iov_len;
-
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
- pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
- __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
- pr_info("RPC: %s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
- pr_info("RPC: %s: RPC: %*ph\n",
- __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
-#endif
-
- req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
- req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
- req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
- req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
- req->rl_send_iov[1].length = rpclen;
- req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 2;
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
+ &rqst->rq_snd_buf, rpcrdma_noch))
+ return -EIO;
return 0;
}
@@ -402,7 +373,7 @@ out_overflow:
out_short:
pr_warn("RPC/RDMA short backward direction call\n");
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
xprt_disconnect_done(xprt);
else
pr_warn("RPC: %s: reposting rep %p\n",
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 21cb3b150b37..1ebb09e1ac4f 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -160,9 +160,8 @@ static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
- rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
- RPCRDMA_MAX_DATA_SEGS /
- RPCRDMA_MAX_FMR_SGES));
+ ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES);
return 0;
}
@@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
list_for_each_entry(mw, &req->rl_registered, mw_list)
list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+ r_xprt->rx_stats.local_inv_needed++;
rc = ib_unmap_fmr(&unmap_list);
if (rc)
goto out_reset;
@@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_init_mr = fmr_op_init_mr,
.ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
+ .ro_send_w_inv_ok = 0,
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 892b5e1d9b09..47bed5333c7f 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -44,18 +44,20 @@
* being done.
*
* When the underlying transport disconnects, MRs are left in one of
- * three states:
+ * four states:
*
* INVALID: The MR was not in use before the QP entered ERROR state.
- * (Or, the LOCAL_INV WR has not completed or flushed yet).
- *
- * STALE: The MR was being registered or unregistered when the QP
- * entered ERROR state, and the pending WR was flushed.
*
* VALID: The MR was registered before the QP entered ERROR state.
*
- * When frwr_op_map encounters STALE and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Beause MR recovery
+ * FLUSHED_FR: The MR was being registered when the QP entered ERROR
+ * state, and the pending WR was flushed.
+ *
+ * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR
+ * state, and the pending WR was flushed.
+ *
+ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
* allocates fresh resources, it is deferred to a workqueue, and the
* recovered MRs are placed back on the rb_mws list when recovery is
* complete. frwr_op_map allocates another MR for the current RPC while
@@ -67,6 +69,8 @@
* pending send queue WRs before the transport is reconnected.
*/
+#include <linux/sunrpc/rpc_rdma.h>
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -97,7 +101,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
struct rpcrdma_frmr *f = &r->frmr;
int rc;
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
@@ -153,7 +157,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return rc;
}
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
ia->ri_max_frmr_depth);
if (IS_ERR(f->fr_mr)) {
pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
@@ -161,26 +165,25 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return PTR_ERR(f->fr_mr);
}
- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, f);
f->fr_state = FRMR_IS_INVALID;
return 0;
}
/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
- *
- * There's no recovery if this fails. The FRMR is abandoned, but
- * remains in rb_all. It will be cleaned up when the transport is
- * destroyed.
*/
static void
frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
+ enum rpcrdma_frmr_state state = mw->frmr.fr_state;
struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (state != FRMR_FLUSHED_LI)
+ ib_dma_unmap_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
if (rc)
goto out_release;
@@ -203,11 +206,16 @@ static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
int depth, delta;
+ ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- ia->ri_device->attrs.max_fast_reg_page_list_len);
+ attrs->max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -234,17 +242,16 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
- cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) {
+ cdata->max_requests = attrs->max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
depth;
}
- rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
- RPCRDMA_MAX_DATA_SEGS /
- ia->ri_max_frmr_depth));
+ ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth);
return 0;
}
@@ -261,10 +268,8 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
}
static void
-__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr,
- const char *wr)
+__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
{
- frmr->fr_state = FRMR_IS_STALE;
if (wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("rpcrdma: %s: %s (%u/0x%x)\n",
wr, ib_wc_status_msg(wc->status),
@@ -287,7 +292,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS) {
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- __frwr_sendcompletion_flush(wc, frmr, "fastreg");
+ frmr->fr_state = FRMR_FLUSHED_FR;
+ __frwr_sendcompletion_flush(wc, "fastreg");
}
}
@@ -307,7 +313,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS) {
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- __frwr_sendcompletion_flush(wc, frmr, "localinv");
+ frmr->fr_state = FRMR_FLUSHED_LI;
+ __frwr_sendcompletion_flush(wc, "localinv");
}
}
@@ -327,9 +334,11 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- if (wc->status != IB_WC_SUCCESS)
- __frwr_sendcompletion_flush(wc, frmr, "localinv");
- complete_all(&frmr->fr_linv_done);
+ if (wc->status != IB_WC_SUCCESS) {
+ frmr->fr_state = FRMR_FLUSHED_LI;
+ __frwr_sendcompletion_flush(wc, "localinv");
+ }
+ complete(&frmr->fr_linv_done);
}
/* Post a REG_MR Work Request to register a memory region
@@ -340,6 +349,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -375,8 +385,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
++seg;
++i;
-
- /* Check for holes */
+ if (holes_ok)
+ continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
@@ -396,7 +406,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
goto out_mapmr_err;
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, mw->mw_nents, mr->length);
+ __func__, frmr, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -413,7 +423,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr);
rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
if (rc)
goto out_senderr;
@@ -443,24 +453,6 @@ out_senderr:
return -ENOTCONN;
}
-static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
-{
- struct rpcrdma_frmr *f = &mw->frmr;
- struct ib_send_wr *invalidate_wr;
-
- f->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &f->fr_invwr;
-
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- f->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &f->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
-
- return invalidate_wr;
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -471,11 +463,12 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+ struct ib_send_wr *first, **prev, *last, *bad_wr;
+ struct rpcrdma_rep *rep = req->rl_reply;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- int rc;
+ int count, rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -485,32 +478,53 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* a single ib_post_send() call.
*/
f = NULL;
- invalidate_wrs = pos = prev = NULL;
+ count = 0;
+ prev = &first;
list_for_each_entry(mw, &req->rl_registered, mw_list) {
- pos = __frwr_prepare_linv_wr(mw);
+ mw->frmr.fr_state = FRMR_IS_INVALID;
+
+ if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
+ (mw->mw_handle == rep->rr_inv_rkey))
+ continue;
- if (!invalidate_wrs)
- invalidate_wrs = pos;
- else
- prev->next = pos;
- prev = pos;
f = &mw->frmr;
+ dprintk("RPC: %s: invalidating frmr %p\n",
+ __func__, f);
+
+ f->fr_cqe.done = frwr_wc_localinv;
+ last = &f->fr_invwr;
+ memset(last, 0, sizeof(*last));
+ last->wr_cqe = &f->fr_cqe;
+ last->opcode = IB_WR_LOCAL_INV;
+ last->ex.invalidate_rkey = mw->mw_handle;
+ count++;
+
+ *prev = last;
+ prev = &last->next;
}
+ if (!f)
+ goto unmap;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+ last->send_flags = IB_SEND_SIGNALED;
f->fr_cqe.done = frwr_wc_localinv_wake;
reinit_completion(&f->fr_linv_done);
- INIT_CQCOUNT(&r_xprt->rx_ep);
+
+ /* Initialize CQ count, since there is always a signaled
+ * WR being posted here. The new cqcount depends on how
+ * many SQEs are about to be consumed.
+ */
+ rpcrdma_init_cqcount(&r_xprt->rx_ep, count);
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
* unless ri_id->qp is a valid pointer.
*/
- rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+ r_xprt->rx_stats.local_inv_needed++;
+ rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
if (rc)
goto reset_mrs;
@@ -521,6 +535,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
unmap:
list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ dprintk("RPC: %s: DMA unmapping frmr %p\n",
+ __func__, &mw->frmr);
list_del_init(&mw->mw_list);
ib_dma_unmap_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -537,7 +553,7 @@ reset_mrs:
*/
list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
- if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ if (mw->mw_handle == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
@@ -576,4 +592,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_init_mr = frwr_op_init_mr,
.ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr",
+ .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK,
};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a47f170b20ef..c52e0f2ffe52 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-enum rpcrdma_chunktype {
- rpcrdma_noch = 0,
- rpcrdma_readch,
- rpcrdma_areadch,
- rpcrdma_writech,
- rpcrdma_replych
-};
-
static const char transfertypes[][12] = {
"inline", /* no chunks */
"read list", /* some argument via rdma read */
@@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
return size;
}
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
- struct rpcrdma_create_data_internal *cdata,
- unsigned int maxsegs)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ unsigned int maxsegs = ia->ri_max_segs;
+
ia->ri_max_inline_write = cdata->inline_wsize -
rpcrdma_max_call_header_size(maxsegs);
ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
-static int
-rpcrdma_tail_pullup(struct xdr_buf *buf)
-{
- size_t tlen = buf->tail[0].iov_len;
- size_t skip = tlen & 3;
-
- /* Do not include the tail if it is only an XDR pad */
- if (tlen < 4)
- return 0;
-
- /* xdr_write_pages() adds a pad at the beginning of the tail
- * if the content in "buf->pages" is unaligned. Force the
- * tail's actual content to land at the next XDR position
- * after the head instead.
- */
- if (skip) {
- unsigned char *src, *dst;
- unsigned int count;
-
- src = buf->tail[0].iov_base;
- dst = buf->head[0].iov_base;
- dst += buf->head[0].iov_len;
-
- src += skip;
- tlen -= skip;
-
- dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
- __func__, skip, dst, src, tlen);
-
- for (count = tlen; count; count--)
- *dst++ = *src++;
- }
-
- return tlen;
-}
-
/* Split "vec" on page boundaries into segments. FMR registers pages,
* not a byte range. Other modes coalesce these segments into a single
* MR when they can.
@@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
+ bool reminv_expected)
{
int len, n, p, page_base;
struct page **ppages;
@@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (type == rpcrdma_readch)
return n;
+ /* When encoding the Write list, some servers need to see an extra
+ * segment for odd-length Write chunks. The upper layer provides
+ * space in the tail iovec for this purpose.
+ */
+ if (type == rpcrdma_writech && reminv_expected)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
if (rtype == rpcrdma_areadch)
pos = 0;
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg);
+ wtype, seg,
+ r_xprt->rx_ia.ri_reminv_expected);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ r_xprt->rx_ia.ri_reminv_expected);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr;
}
-/*
- * Copy write data inline.
- * This function is used for "small" requests. Data which is passed
- * to RPC via iovecs (or page list) is copied directly into the
- * pre-registered memory buffer for this request. For small amounts
- * of data, this is efficient. The cutoff value is tunable.
+/* Prepare the RPC-over-RDMA header SGE.
+ */
+static bool
+rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ u32 len)
+{
+ struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+ struct ib_sge *sge = &req->rl_send_sge[0];
+
+ if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
+ if (!__rpcrdma_dma_map_regbuf(ia, rb))
+ return false;
+ sge->addr = rdmab_addr(rb);
+ sge->lkey = rdmab_lkey(rb);
+ }
+ sge->length = len;
+
+ ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+ sge->length, DMA_TO_DEVICE);
+ req->rl_send_wr.num_sge++;
+ return true;
+}
+
+/* Prepare the Send SGEs. The head and tail iovec, and each entry
+ * in the page list, gets its own SGE.
*/
-static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
+static bool
+rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
- int i, npages, curlen;
- int copy_len;
- unsigned char *srcp, *destp;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int page_base;
- struct page **ppages;
+ unsigned int sge_no, page_base, len, remaining;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ struct ib_device *device = ia->ri_device;
+ struct ib_sge *sge = req->rl_send_sge;
+ u32 lkey = ia->ri_pd->local_dma_lkey;
+ struct page *page, **ppages;
+
+ /* The head iovec is straightforward, as it is already
+ * DMA-mapped. Sync the content that has changed.
+ */
+ if (!rpcrdma_dma_map_regbuf(ia, rb))
+ return false;
+ sge_no = 1;
+ sge[sge_no].addr = rdmab_addr(rb);
+ sge[sge_no].length = xdr->head[0].iov_len;
+ sge[sge_no].lkey = rdmab_lkey(rb);
+ ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+ sge[sge_no].length, DMA_TO_DEVICE);
+
+ /* If there is a Read chunk, the page list is being handled
+ * via explicit RDMA, and thus is skipped here. However, the
+ * tail iovec may include an XDR pad for the page list, as
+ * well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+ if (rtype == rpcrdma_readch) {
+ len = xdr->tail[0].iov_len;
- destp = rqst->rq_svec[0].iov_base;
- curlen = rqst->rq_svec[0].iov_len;
- destp += curlen;
+ /* Do not include the tail if it is only an XDR pad */
+ if (len < 4)
+ goto out;
- dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
- __func__, destp, rqst->rq_slen, curlen);
+ page = virt_to_page(xdr->tail[0].iov_base);
+ page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
- copy_len = rqst->rq_snd_buf.page_len;
+ /* If the content in the page list is an odd length,
+ * xdr_write_pages() has added a pad at the beginning
+ * of the tail iovec. Force the tail's non-pad content
+ * to land at the next XDR position in the Send message.
+ */
+ page_base += len & 3;
+ len -= len & 3;
+ goto map_tail;
+ }
- if (rqst->rq_snd_buf.tail[0].iov_len) {
- curlen = rqst->rq_snd_buf.tail[0].iov_len;
- if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
- memmove(destp + copy_len,
- rqst->rq_snd_buf.tail[0].iov_base, curlen);
- r_xprt->rx_stats.pullup_copy_count += curlen;
+ /* If there is a page list present, temporarily DMA map
+ * and prepare an SGE for each page to be sent.
+ */
+ if (xdr->page_len) {
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge_no++;
+ if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
+ goto out_mapping_overflow;
+
+ len = min_t(u32, PAGE_SIZE - page_base, remaining);
+ sge[sge_no].addr = ib_dma_map_page(device, *ppages,
+ page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ goto out_mapping_err;
+ sge[sge_no].length = len;
+ sge[sge_no].lkey = lkey;
+
+ req->rl_mapped_sges++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
}
- dprintk("RPC: %s: tail destp 0x%p len %d\n",
- __func__, destp + copy_len, curlen);
- rqst->rq_svec[0].iov_len += curlen;
}
- r_xprt->rx_stats.pullup_copy_count += copy_len;
- page_base = rqst->rq_snd_buf.page_base;
- ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
- page_base &= ~PAGE_MASK;
- npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
- for (i = 0; copy_len && i < npages; i++) {
- curlen = PAGE_SIZE - page_base;
- if (curlen > copy_len)
- curlen = copy_len;
- dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
- __func__, i, destp, copy_len, curlen);
- srcp = kmap_atomic(ppages[i]);
- memcpy(destp, srcp+page_base, curlen);
- kunmap_atomic(srcp);
- rqst->rq_svec[0].iov_len += curlen;
- destp += curlen;
- copy_len -= curlen;
- page_base = 0;
+ /* The tail iovec is not always constructed in the same
+ * page where the head iovec resides (see, for example,
+ * gss_wrap_req_priv). To neatly accommodate that case,
+ * DMA map it separately.
+ */
+ if (xdr->tail[0].iov_len) {
+ page = virt_to_page(xdr->tail[0].iov_base);
+ page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+ len = xdr->tail[0].iov_len;
+
+map_tail:
+ sge_no++;
+ sge[sge_no].addr = ib_dma_map_page(device, page,
+ page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ goto out_mapping_err;
+ sge[sge_no].length = len;
+ sge[sge_no].lkey = lkey;
+ req->rl_mapped_sges++;
}
- /* header now contains entire send message */
+
+out:
+ req->rl_send_wr.num_sge = sge_no + 1;
+ return true;
+
+out_mapping_overflow:
+ pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
+ return false;
+
+out_mapping_err:
+ pr_err("rpcrdma: Send mapping error\n");
+ return false;
+}
+
+bool
+rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ u32 hdrlen, struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
+{
+ req->rl_send_wr.num_sge = 0;
+ req->rl_mapped_sges = 0;
+
+ if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
+ goto out_map;
+
+ if (rtype != rpcrdma_areadch)
+ if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
+ goto out_map;
+
+ return true;
+
+out_map:
+ pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+ return false;
+}
+
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_device *device = ia->ri_device;
+ struct ib_sge *sge;
+ int count;
+
+ sge = &req->rl_send_sge[2];
+ for (count = req->rl_mapped_sges; count--; sge++)
+ ib_dma_unmap_page(device, sge->addr, sge->length,
+ DMA_TO_DEVICE);
+ req->rl_mapped_sges = 0;
}
/*
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Prepares up to two IOVs per Call message:
- *
- * [0] -- RPC RDMA header
- * [1] -- the RPC header/data
- *
* Returns zero on success, otherwise a negative errno.
*/
@@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
- rpcrdma_inline_pullup(rqst);
- rpclen = rqst->rq_svec[0].iov_len;
+ rpclen = rqst->rq_snd_buf.len;
} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
- rpclen = rqst->rq_svec[0].iov_len;
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
+ rpclen = rqst->rq_snd_buf.head[0].iov_len +
+ rqst->rq_snd_buf.tail[0].iov_len;
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
goto out_unmap;
hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
- if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
- goto out_overflow;
-
dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
rqst->rq_task->tk_pid, __func__,
transfertypes[rtype], transfertypes[wtype],
hdrlen, rpclen);
- req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
- req->rl_send_iov[0].length = hdrlen;
- req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
- req->rl_niovs = 1;
- if (rtype == rpcrdma_areadch)
- return 0;
-
- req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
- req->rl_send_iov[1].length = rpclen;
- req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 2;
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+ &rqst->rq_snd_buf, rtype)) {
+ iptr = ERR_PTR(-EIO);
+ goto out_unmap;
+ }
return 0;
-out_overflow:
- pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
- hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
- iptr = ERR_PTR(-EIO);
-
out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
return PTR_ERR(iptr);
@@ -725,7 +786,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
ifdebug(FACILITY) {
u64 off;
xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
+ dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
__func__,
be32_to_cpu(seg->rs_length),
(unsigned long long)off,
@@ -845,28 +906,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
return fixup_copy_count;
}
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
- struct rpcrdma_ep *ep =
- container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
- struct rpcrdma_xprt *r_xprt =
- container_of(ep, struct rpcrdma_xprt, rx_ep);
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
- spin_lock_bh(&xprt->transport_lock);
- if (++xprt->connect_cookie == 0) /* maintain a reserved value */
- ++xprt->connect_cookie;
- if (ep->rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt))
- xprt_wake_pending_tasks(xprt, 0);
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
- }
- spin_unlock_bh(&xprt->transport_lock);
-}
-
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
@@ -898,26 +937,16 @@ rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
- schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
*/
void
-rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+rpcrdma_reply_handler(struct work_struct *work)
{
+ struct rpcrdma_rep *rep =
+ container_of(work, struct rpcrdma_rep, rr_work);
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
@@ -1132,6 +1161,6 @@ out_duplicate:
repost:
r_xprt->rx_stats.bad_reply_count++;
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
rpcrdma_recv_buffer_put(rep);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a2a7519b0f23..288e35c2d8f4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
ret = -EIO;
goto out_unmap;
}
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
memset(&send_wr, 0, sizeof(send_wr));
ctxt->cqe.done = svc_rdma_wc_send;
@@ -159,35 +159,40 @@ out_unmap:
/* Server-side transport endpoint wants a whole page for its send
* buffer. The client RPC code constructs the RPC header in this
* buffer before it invokes ->send_request.
- *
- * Returns NULL if there was a temporary allocation failure.
*/
-static void *
-xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
- struct svcxprt_rdma *rdma;
+ size_t size = rqst->rq_callsize;
struct page *page;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
-
- /* Prevent an infinite loop: try to make this case work */
- if (size > PAGE_SIZE)
+ if (size > PAGE_SIZE) {
WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
size);
+ return -EINVAL;
+ }
+ /* svc_rdma_sendto releases this page */
page = alloc_page(RPCRDMA_DEF_GFP);
if (!page)
- return NULL;
+ return -ENOMEM;
+ rqst->rq_buffer = page_address(page);
- return page_address(page);
+ rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP);
+ if (!rqst->rq_rbuffer) {
+ put_page(page);
+ return -ENOMEM;
+ }
+ return 0;
}
static void
-xprt_rdma_bc_free(void *buffer)
+xprt_rdma_bc_free(struct rpc_task *task)
{
- /* No-op: ctxt and page have already been freed. */
+ struct rpc_rqst *rqst = task->tk_rqstp;
+
+ kfree(rqst->rq_rbuffer);
}
static int
@@ -350,6 +355,7 @@ xprt_setup_rdma_bc(struct xprt_create *args)
out_fail:
xprt_rdma_free_addresses(xprt);
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
xprt_free(xprt);
return ERR_PTR(-EINVAL);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c25606f2561..172b537f8cfc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
ctxt->sge[pno].addr);
if (ret)
goto err;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[pno].length = len;
@@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
frmr->sg);
return -ENOMEM;
}
- atomic_inc(&xprt->sc_dma_used);
n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
@@ -348,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- ib_dma_unmap_sg(xprt->sc_cm_id->device,
- frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -374,9 +371,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
u32 position, u32 byte_count, u32 page_offset, int page_no)
{
char *srcp, *destp;
- int ret;
- ret = 0;
srcp = head->arg.head[0].iov_base + position;
byte_count = head->arg.head[0].iov_len - position;
if (byte_count > PAGE_SIZE) {
@@ -415,6 +410,20 @@ done:
return 1;
}
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+static struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
struct svc_rqst *rqstp,
@@ -627,8 +636,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto defer;
goto out;
}
- dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
- ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n",
+ ctxt, rdma_xprt, rqstp);
atomic_inc(&rdma_stat_recv);
/* Build up the XDR from the receive buffers. */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 54d533300620..ad4d286a83c5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -153,76 +153,68 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
-/* Returns the address of the first read chunk or <nul> if no read chunk
- * is present
+/* Parse the RPC Call's transport header.
*/
-struct rpcrdma_read_chunk *
-svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
+ struct rpcrdma_write_array **write,
+ struct rpcrdma_write_array **reply)
{
- struct rpcrdma_read_chunk *ch =
- (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ __be32 *p;
- if (ch->rc_discrim == xdr_zero)
- return NULL;
- return ch;
-}
+ p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
-/* Returns the address of the first read write array element or <nul>
- * if no write array list is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
-{
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] == xdr_zero)
- return NULL;
- return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+ /* Read list */
+ while (*p++ != xdr_zero)
+ p += 5;
+
+ /* Write list */
+ if (*p != xdr_zero) {
+ *write = (struct rpcrdma_write_array *)p;
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpu(*p) * 4;
+ } else {
+ *write = NULL;
+ p++;
+ }
+
+ /* Reply chunk */
+ if (*p != xdr_zero)
+ *reply = (struct rpcrdma_write_array *)p;
+ else
+ *reply = NULL;
}
-/* Returns the address of the first reply array element or <nul> if no
- * reply array is present
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one rkey to invalidate.
+ *
+ * Find a candidate rkey to invalidate when sending a reply. Picks the
+ * first rkey it finds in the chunks lists.
+ *
+ * Returns zero if RPC's chunk lists are empty.
*/
-static struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array *wr_ary)
+static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_write_array *wr_ary,
+ struct rpcrdma_write_array *rp_ary)
{
- struct rpcrdma_read_chunk *rch;
- struct rpcrdma_write_array *rp_ary;
-
- /* XXX: Need to fix when reply chunk may occur with read list
- * and/or write list.
- */
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] != xdr_zero)
- return NULL;
-
- rch = svc_rdma_get_read_chunk(rmsgp);
- if (rch) {
- while (rch->rc_discrim != xdr_zero)
- rch++;
-
- /* The reply chunk follows an empty write array located
- * at 'rc_position' here. The reply array is at rc_target.
- */
- rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
- goto found_it;
- }
+ struct rpcrdma_read_chunk *rd_ary;
+ struct rpcrdma_segment *arg_ch;
- if (wr_ary) {
- int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+ rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
+ if (rd_ary->rc_discrim != xdr_zero)
+ return be32_to_cpu(rd_ary->rc_target.rs_handle);
- rp_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[chunk].wc_target.rs_length;
- goto found_it;
+ if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
+ arg_ch = &wr_ary->wc_array[0].wc_target;
+ return be32_to_cpu(arg_ch->rs_handle);
}
- /* No read list, no write list */
- rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
+ if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
+ arg_ch = &rp_ary->wc_array[0].wc_target;
+ return be32_to_cpu(arg_ch->rs_handle);
+ }
- found_it:
- if (rp_ary->wc_discrim == xdr_zero)
- return NULL;
- return rp_ary;
+ return 0;
}
/* Assumptions:
@@ -280,7 +272,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
sge[sge_no].addr))
goto err;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count++;
sge_off = 0;
@@ -464,7 +456,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct page *page,
struct rpcrdma_msg *rdma_resp,
struct svc_rdma_req_map *vec,
- int byte_count)
+ int byte_count,
+ u32 inv_rkey)
{
struct svc_rdma_op_ctxt *ctxt;
struct ib_send_wr send_wr;
@@ -489,7 +482,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[0].length, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
goto err;
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
ctxt->direction = DMA_TO_DEVICE;
@@ -505,7 +498,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
if (ib_dma_mapping_error(rdma->sc_cm_id->device,
ctxt->sge[sge_no].addr))
goto err;
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
@@ -523,23 +516,9 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
- /*
- * If there are more pages than SGE, terminate SGE
- * list so that svc_rdma_unmap_dma doesn't attempt to
- * unmap garbage.
- */
- if (page_no+1 >= sge_no)
- ctxt->sge[page_no+1].length = 0;
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
- /* The loop above bumps sc_dma_used for each sge. The
- * xdr_buf.tail gets a separate sge, but resides in the
- * same page as xdr_buf.head. Don't count it twice.
- */
- if (sge_no > ctxt->count)
- atomic_dec(&rdma->sc_dma_used);
-
if (sge_no > rdma->sc_max_sge) {
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err;
@@ -549,7 +528,11 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.wr_cqe = &ctxt->cqe;
send_wr.sg_list = ctxt->sge;
send_wr.num_sge = sge_no;
- send_wr.opcode = IB_WR_SEND;
+ if (inv_rkey) {
+ send_wr.opcode = IB_WR_SEND_WITH_INV;
+ send_wr.ex.invalidate_rkey = inv_rkey;
+ } else
+ send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
ret = svc_rdma_send(rdma, &send_wr);
@@ -581,6 +564,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
int inline_bytes;
struct page *res_page;
struct svc_rdma_req_map *vec;
+ u32 inv_rkey;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -588,8 +572,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
* places this at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- wr_ary = svc_rdma_get_write_array(rdma_argp);
- rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
+
+ inv_rkey = 0;
+ if (rdma->sc_snd_w_inv)
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
/* Build an req vec for the XDR */
vec = svc_rdma_get_req_map(rdma);
@@ -598,7 +585,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
inline_bytes = rqstp->rq_res.len;
- /* Create the RDMA response header */
+ /* Create the RDMA response header. xprt->xpt_mutex,
+ * acquired in svc_send(), serializes RPC replies. The
+ * code path below that inserts the credit grant value
+ * into each transport header runs only inside this
+ * critical section.
+ */
ret = -ENOMEM;
res_page = alloc_page(GFP_KERNEL);
if (!res_page)
@@ -633,9 +625,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err1;
ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
- inline_bytes);
+ inline_bytes, inv_rkey);
if (ret < 0)
- goto err1;
+ goto err0;
svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
@@ -692,7 +684,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
svc_rdma_put_context(ctxt, 1);
return;
}
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
/* Prepare SEND WR */
memset(&err_wr, 0, sizeof(err_wr));
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index dd9440137834..39652d390a9c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -41,6 +41,7 @@
*/
#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/interrupt.h>
@@ -67,6 +68,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
static struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
@@ -79,6 +81,7 @@ static struct svc_xprt_ops svc_rdma_ops = {
.xpo_has_wspace = svc_rdma_has_wspace,
.xpo_accept = svc_rdma_accept,
.xpo_secure_port = svc_rdma_secure_port,
+ .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
};
struct svc_xprt_class svc_rdma_class = {
@@ -198,6 +201,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
out:
ctxt->count = 0;
+ ctxt->mapped_sges = 0;
ctxt->frmr = NULL;
return ctxt;
@@ -221,22 +225,24 @@ out_empty:
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
struct svcxprt_rdma *xprt = ctxt->xprt;
- int i;
- for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+ struct ib_device *device = xprt->sc_cm_id->device;
+ u32 lkey = xprt->sc_pd->local_dma_lkey;
+ unsigned int i;
+
+ for (i = 0; i < ctxt->mapped_sges; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_page(xprt->sc_cm_id->device,
+ if (ctxt->sge[i].lkey == lkey)
+ ib_dma_unmap_page(device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
- }
}
+ ctxt->mapped_sges = 0;
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -390,7 +396,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- ctxt->wc_status = wc->status;
svc_rdma_unmap_dma(ctxt);
if (wc->status != IB_WC_SUCCESS)
@@ -428,7 +433,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
goto err;
out:
- atomic_dec(&xprt->sc_sq_count);
+ atomic_inc(&xprt->sc_sq_avail);
wake_up(&xprt->sc_send_wait);
return;
@@ -600,7 +605,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
goto err_put_ctxt;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
@@ -642,6 +647,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
return ret;
}
+static void
+svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
+ struct rdma_conn_param *param)
+{
+ const struct rpcrdma_connect_private *pmsg = param->private_data;
+
+ if (pmsg &&
+ pmsg->cp_magic == rpcrdma_cmp_magic &&
+ pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+ newxprt->sc_snd_w_inv = pmsg->cp_flags &
+ RPCRDMA_CMP_F_SND_W_INV_OK;
+
+ dprintk("svcrdma: client send_size %u, recv_size %u "
+ "remote inv %ssupported\n",
+ rpcrdma_decode_buffer_size(pmsg->cp_send_size),
+ rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
+ newxprt->sc_snd_w_inv ? "" : "un");
+ }
+}
+
/*
* This function handles the CONNECT_REQUEST event on a listening
* endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -653,7 +678,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
* will call the recvfrom method on the listen xprt which will accept the new
* connection.
*/
-static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id,
+ struct rdma_conn_param *param)
{
struct svcxprt_rdma *listen_xprt = new_cma_id->context;
struct svcxprt_rdma *newxprt;
@@ -669,9 +695,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
new_cma_id->context = newxprt;
dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
newxprt, newxprt->sc_cm_id, listen_xprt);
+ svc_rdma_parse_connect_private(newxprt, param);
/* Save client advertised inbound read limit for use later in accept. */
- newxprt->sc_ord = client_ird;
+ newxprt->sc_ord = param->initiator_depth;
/* Set the local and remote addresses in the transport */
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
@@ -706,8 +733,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event = %s (%d)\n", cma_id, cma_id->context,
rdma_event_msg(event->event), event->event);
- handle_connect_req(cma_id,
- event->param.conn.initiator_depth);
+ handle_connect_req(cma_id, &event->param.conn);
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -917,7 +943,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -941,8 +966,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
+ struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
+ struct sockaddr *sap;
unsigned int i;
int ret = 0;
@@ -980,6 +1007,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
@@ -993,7 +1021,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
- newxprt->sc_pd = ib_alloc_pd(dev);
+ newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
@@ -1022,18 +1050,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
- " cm_id->device=%p, sc_pd->device=%p\n"
- " cap.max_send_wr = %d\n"
- " cap.max_recv_wr = %d\n"
- " cap.max_send_sge = %d\n"
- " cap.max_recv_sge = %d\n",
- newxprt->sc_cm_id, newxprt->sc_pd,
- dev, newxprt->sc_pd->device,
- qp_attr.cap.max_send_wr,
- qp_attr.cap.max_recv_wr,
- qp_attr.cap.max_send_sge,
- qp_attr.cap.max_recv_sge);
+ dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
+ newxprt->sc_cm_id, newxprt->sc_pd);
+ dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+ qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+ dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+ qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
@@ -1070,7 +1092,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dev->attrs.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
newxprt->sc_reader = rdma_read_chunk_frmr;
- }
+ } else
+ newxprt->sc_snd_w_inv = false;
/*
* Determine if a DMA MR is required and if so, what privs are required
@@ -1094,11 +1117,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/* Swap out the handler */
newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+ /* Construct RDMA-CM private message */
+ pmsg.cp_magic = rpcrdma_cmp_magic;
+ pmsg.cp_version = RPCRDMA_CMP_VERSION;
+ pmsg.cp_flags = 0;
+ pmsg.cp_send_size = pmsg.cp_recv_size =
+ rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
+
/* Accept Connection */
set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 0;
conn_param.initiator_depth = newxprt->sc_ord;
+ conn_param.private_data = &pmsg;
+ conn_param.private_data_len = sizeof(pmsg);
ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
if (ret) {
dprintk("svcrdma: failed to accept new connection, ret=%d\n",
@@ -1106,31 +1138,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
- dprintk("svcrdma: new connection %p accepted with the following "
- "attributes:\n"
- " local_ip : %pI4\n"
- " local_port : %d\n"
- " remote_ip : %pI4\n"
- " remote_port : %d\n"
- " max_sge : %d\n"
- " max_sge_rd : %d\n"
- " sq_depth : %d\n"
- " max_requests : %d\n"
- " ord : %d\n",
- newxprt,
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_port),
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_port),
- newxprt->sc_max_sge,
- newxprt->sc_max_sge_rd,
- newxprt->sc_sq_depth,
- newxprt->sc_max_requests,
- newxprt->sc_ord);
+ dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
+ dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
+ dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd);
+ dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
+ dprintk(" ord : %d\n", newxprt->sc_ord);
return &newxprt->sc_xprt;
@@ -1184,9 +1201,9 @@ static void __svc_rdma_free(struct work_struct *work)
ib_drain_qp(rdma->sc_qp);
/* We should only be called from kref_put */
- if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+ if (kref_read(&xprt->xpt_ref) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
/*
* Destroy queued, but not processed read completions. Note
@@ -1217,9 +1234,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
rdma->sc_ctxt_used);
- if (atomic_read(&rdma->sc_dma_used) != 0)
- pr_err("svcrdma: dma still in use? (%d)\n",
- atomic_read(&rdma->sc_dma_used));
/* Final put of backchannel client transport */
if (xprt->xpt_bc_xprt) {
@@ -1279,6 +1293,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
return 1;
}
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr, *n_wr;
@@ -1295,15 +1313,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
- spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
- spin_unlock_bh(&xprt->sc_lock);
+ if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
atomic_inc(&rdma_stat_sq_starve);
/* Wait until SQ WR available if SQ still full */
+ atomic_add(wr_count, &xprt->sc_sq_avail);
wait_event(xprt->sc_send_wait,
- atomic_read(&xprt->sc_sq_count) <
- xprt->sc_sq_depth);
+ atomic_read(&xprt->sc_sq_avail) > wr_count);
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
continue;
@@ -1313,21 +1329,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
svc_xprt_get(&xprt->sc_xprt);
/* Bump used SQ WR count and post */
- atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (ret) {
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- atomic_sub(wr_count, &xprt->sc_sq_count);
for (i = 0; i < wr_count; i ++)
svc_xprt_put(&xprt->sc_xprt);
- dprintk("svcrdma: failed to post SQ WR rc=%d, "
- "sc_sq_count=%d, sc_sq_depth=%d\n",
- ret, atomic_read(&xprt->sc_sq_count),
- xprt->sc_sq_depth);
- }
- spin_unlock_bh(&xprt->sc_lock);
- if (ret)
+ dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
+ dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n",
+ atomic_read(&xprt->sc_sq_avail),
+ xprt->sc_sq_depth);
wake_up(&xprt->sc_send_wait);
+ }
break;
}
return ret;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 81f0e879f019..534c178d2a7e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
.data = &xprt_rdma_max_inline_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
.data = &xprt_rdma_max_inline_write,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
@@ -219,6 +219,34 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
}
}
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ep, struct rpcrdma_xprt, rx_ep);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (++xprt->connect_cookie == 0) /* maintain a reserved value */
+ ++xprt->connect_cookie;
+ if (ep->rep_connected > 0) {
+ if (!xprt_test_and_set_connected(xprt))
+ xprt_wake_pending_tasks(xprt, 0);
+ } else {
+ if (xprt_test_and_clear_connected(xprt))
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
static void
xprt_rdma_connect_worker(struct work_struct *work)
{
@@ -477,115 +505,153 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
}
}
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ gfp_t flags)
+{
+ size_t size = RPCRDMA_HDRBUF_SIZE;
+ struct rpcrdma_regbuf *rb;
+
+ if (req->rl_rdmabuf)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_rdmabuf = rb;
+ return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ size_t size, gfp_t flags)
+{
+ struct rpcrdma_regbuf *rb;
+
+ if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ rpcrdma_free_regbuf(req->rl_sendbuf);
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_sendbuf = rb;
+ return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
+ *
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
*
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * A regbuf is used here to remember the buffer size.
*/
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ size_t size, gfp_t flags)
{
- struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_regbuf *rb;
+
+ if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ rpcrdma_free_regbuf(req->rl_recvbuf);
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_recvbuf = rb;
+ return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ * 0: Success; rq_buffer points to RPC buffer to use
+ * ENOMEM: Out of memory, call again later
+ * EIO: A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req;
- size_t min_size;
gfp_t flags;
req = rpcrdma_buffer_get(&r_xprt->rx_buf);
if (req == NULL)
- return NULL;
+ return -ENOMEM;
flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
- if (req->rl_rdmabuf == NULL)
- goto out_rdmabuf;
- if (req->rl_sendbuf == NULL)
- goto out_sendbuf;
- if (size > req->rl_sendbuf->rg_size)
- goto out_sendbuf;
-
-out:
- dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
- req->rl_connect_cookie = 0; /* our reserved value */
- req->rl_task = task;
- return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
- min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
- rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
- if (IS_ERR(rb))
+ if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
goto out_fail;
- req->rl_rdmabuf = rb;
-
-out_sendbuf:
- /* XDR encoding and RPC/RDMA marshaling of this request has not
- * yet occurred. Thus a lower bound is needed to prevent buffer
- * overrun during marshaling.
- *
- * RPC/RDMA marshaling may choose to send payload bearing ops
- * inline, if the result is smaller than the inline threshold.
- * The value of the "size" argument accounts for header
- * requirements but not for the payload in these cases.
- *
- * Likewise, allocate enough space to receive a reply up to the
- * size of the inline threshold.
- *
- * It's unlikely that both the send header and the received
- * reply will be large, but slush is provided here to allow
- * flexibility when marshaling.
- */
- min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
- min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
- if (size < min_size)
- size = min_size;
-
- rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
- if (IS_ERR(rb))
+ if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+ goto out_fail;
+ if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
goto out_fail;
- rb->rg_owner = req;
- r_xprt->rx_stats.hardway_register_count += size;
- rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
- req->rl_sendbuf = rb;
- goto out;
+ dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+ task->tk_pid, __func__, rqst->rq_callsize,
+ rqst->rq_rcvsize, req);
+
+ req->rl_connect_cookie = 0; /* our reserved value */
+ rpcrdma_set_xprtdata(rqst, req);
+ rqst->rq_buffer = req->rl_sendbuf->rg_base;
+ rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+ return 0;
out_fail:
rpcrdma_buffer_put(req);
- return NULL;
+ return -ENOMEM;
}
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
*/
static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
{
- struct rpcrdma_req *req;
- struct rpcrdma_xprt *r_xprt;
- struct rpcrdma_regbuf *rb;
-
- if (buffer == NULL)
- return;
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
- req = rb->rg_owner;
if (req->rl_backchannel)
return;
- r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
- !RPC_IS_ASYNC(req->rl_task));
-
+ if (unlikely(!list_empty(&req->rl_registered)))
+ ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+ rpcrdma_unmap_sges(ia, req);
rpcrdma_buffer_put(req);
}
@@ -620,7 +686,8 @@ xprt_rdma_send_request(struct rpc_task *task)
int rc = 0;
/* On retransmit, remove any previously registered chunks */
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ if (unlikely(!list_empty(&req->rl_registered)))
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
@@ -685,10 +752,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
- seq_printf(seq, "%lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %lu\n",
r_xprt->rx_stats.mrs_recovered,
r_xprt->rx_stats.mrs_orphaned,
- r_xprt->rx_stats.mrs_allocated);
+ r_xprt->rx_stats.mrs_allocated,
+ r_xprt->rx_stats.local_inv_needed);
}
static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 799cce6cbe45..11d07748f699 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -103,9 +103,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ib_event_msg(event->event),
- event->device->name, context);
+ pr_err("rpcrdma: %s on device %s ep %p\n",
+ ib_event_msg(event->event), event->device->name, context);
+
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
rpcrdma_conn_func(ep);
@@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
wc->status, wc->vendor_err);
}
-static void
-rpcrdma_receive_worker(struct work_struct *work)
-{
- struct rpcrdma_rep *rep =
- container_of(work, struct rpcrdma_rep, rr_work);
-
- rpcrdma_reply_handler(rep);
-}
-
/* Perform basic sanity checking to avoid using garbage
* to update the credit grant value.
*/
@@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
}
/**
- * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
* @cq: completion queue (ignored)
* @wc: completed WR
*
*/
static void
-rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
+ rep->rr_wc_flags = wc->wc_flags;
+ rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@ out_fail:
goto out_schedule;
}
+static void
+rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+ struct rdma_conn_param *param)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ const struct rpcrdma_connect_private *pmsg = param->private_data;
+ unsigned int rsize, wsize;
+
+ /* Default settings for RPC-over-RDMA Version One */
+ r_xprt->rx_ia.ri_reminv_expected = false;
+ rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+ wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+ if (pmsg &&
+ pmsg->cp_magic == rpcrdma_cmp_magic &&
+ pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+ r_xprt->rx_ia.ri_reminv_expected = true;
+ rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+ wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+ }
+
+ if (rsize < cdata->inline_rsize)
+ cdata->inline_rsize = rsize;
+ if (wsize < cdata->inline_wsize)
+ cdata->inline_wsize = wsize;
+ dprintk("RPC: %s: max send %u, max recv %u\n",
+ __func__, cdata->inline_wsize, cdata->inline_rsize);
+ rpcrdma_set_max_header_sizes(r_xprt);
+}
+
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
" (%d initiator)\n",
__func__, attr->max_dest_rd_atomic,
attr->max_rd_atomic);
+ rpcrdma_update_connect_private(xprt, &event->param.conn);
goto connected;
case RDMA_CM_EVENT_CONNECT_ERROR:
connstate = -ENOTCONN;
@@ -306,6 +331,7 @@ static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
{
+ unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
struct rdma_cm_id *id;
int rc;
@@ -327,8 +353,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto out;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto out;
+ }
/* FIXME:
* Until xprtrdma supports DEVICE_REMOVAL, the provider must
@@ -351,8 +381,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto put;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto put;
+ }
rc = ia->ri_async_rc;
if (rc)
goto put;
@@ -387,7 +421,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
ia->ri_device = ia->ri_id->device;
- ia->ri_pd = ib_alloc_pd(ia->ri_device);
+ ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
@@ -454,11 +488,12 @@ int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
+ struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
struct ib_cq *sendcq, *recvcq;
unsigned int max_qp_wr;
int rc;
- if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+ if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
dprintk("RPC: %s: insufficient sge's available\n",
__func__);
return -ENOMEM;
@@ -487,7 +522,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
- ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -506,7 +541,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0; /* always signal? */
- INIT_CQCOUNT(ep);
+ rpcrdma_init_cqcount(ep, 0);
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -536,9 +571,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Initialize cma parameters */
memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
- /* RPC/RDMA does not use private data */
- ep->rep_remote_cma.private_data = NULL;
- ep->rep_remote_cma.private_data_len = 0;
+ /* Prepare RDMA-CM private message */
+ pmsg->cp_magic = rpcrdma_cmp_magic;
+ pmsg->cp_version = RPCRDMA_CMP_VERSION;
+ pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+ ep->rep_remote_cma.private_data = pmsg;
+ ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +889,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
INIT_LIST_HEAD(&req->rl_registered);
+ req->rl_send_wr.next = NULL;
+ req->rl_send_wr.wr_cqe = &req->rl_cqe;
+ req->rl_send_wr.sg_list = req->rl_send_sge;
+ req->rl_send_wr.opcode = IB_WR_SEND;
return req;
}
@@ -865,17 +909,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
- GFP_KERNEL);
+ rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+ DMA_FROM_DEVICE, GFP_KERNEL);
if (IS_ERR(rep->rr_rdmabuf)) {
rc = PTR_ERR(rep->rr_rdmabuf);
goto out_free;
}
rep->rr_device = ia->ri_device;
- rep->rr_cqe.done = rpcrdma_receive_wc;
+ rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
- INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+ INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+ rep->rr_recv_wr.next = NULL;
+ rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+ rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+ rep->rr_recv_wr.num_sge = 1;
return rep;
out_free:
@@ -966,17 +1014,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
}
static void
-rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
{
- rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+ rpcrdma_free_regbuf(rep->rr_rdmabuf);
kfree(rep);
}
void
-rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_destroy_req(struct rpcrdma_req *req)
{
- rpcrdma_free_regbuf(ia, req->rl_sendbuf);
- rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+ rpcrdma_free_regbuf(req->rl_recvbuf);
+ rpcrdma_free_regbuf(req->rl_sendbuf);
+ rpcrdma_free_regbuf(req->rl_rdmabuf);
kfree(req);
}
@@ -1009,15 +1058,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-
cancel_delayed_work_sync(&buf->rb_recovery_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
rep = rpcrdma_buffer_get_rep_locked(buf);
- rpcrdma_destroy_rep(ia, rep);
+ rpcrdma_destroy_rep(rep);
}
buf->rb_send_count = 0;
@@ -1030,7 +1077,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
list_del(&req->rl_all);
spin_unlock(&buf->rb_reqslock);
- rpcrdma_destroy_req(ia, req);
+ rpcrdma_destroy_req(req);
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1176,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_rep *rep = req->rl_reply;
- req->rl_niovs = 0;
+ req->rl_send_wr.num_sge = 0;
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
@@ -1171,70 +1218,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
spin_unlock(&buffers->rb_lock);
}
-/*
- * Wrappers for internal-use kmalloc memory registration, used by buffer code.
- */
-
/**
- * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
- * @ia: controlling rpcrdma_ia
+ * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
* @size: size of buffer to be allocated, in bytes
+ * @direction: direction of data movement
* @flags: GFP flags
*
- * Returns pointer to private header of an area of internally
- * registered memory, or an ERR_PTR. The registered buffer follows
- * the end of the private header.
+ * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+ * can be persistently DMA-mapped for I/O.
*
* xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
- * receiving the payload of RDMA RECV operations. regbufs are not
- * used for RDMA READ/WRITE operations, thus are registered only for
- * LOCAL access.
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via ro_map.
*/
struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+ gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- struct ib_sge *iov;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
- goto out;
+ return ERR_PTR(-ENOMEM);
- iov = &rb->rg_iov;
- iov->addr = ib_dma_map_single(ia->ri_device,
- (void *)rb->rg_base, size,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_device, iov->addr))
- goto out_free;
+ rb->rg_device = NULL;
+ rb->rg_direction = direction;
+ rb->rg_iov.length = size;
- iov->length = size;
- iov->lkey = ia->ri_pd->local_dma_lkey;
- rb->rg_size = size;
- rb->rg_owner = NULL;
return rb;
+}
-out_free:
- kfree(rb);
-out:
- return ERR_PTR(-ENOMEM);
+/**
+ * __rpcrdma_map_regbuf - DMA-map a regbuf
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be mapped
+ */
+bool
+__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+ if (rb->rg_direction == DMA_NONE)
+ return false;
+
+ rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base,
+ rdmab_length(rb),
+ rb->rg_direction);
+ if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ return false;
+
+ rb->rg_device = ia->ri_device;
+ rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+ return true;
+}
+
+static void
+rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+{
+ if (!rpcrdma_regbuf_is_mapped(rb))
+ return;
+
+ ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+ rdmab_length(rb), rb->rg_direction);
+ rb->rg_device = NULL;
}
/**
* rpcrdma_free_regbuf - deregister and free registered buffer
- * @ia: controlling rpcrdma_ia
* @rb: regbuf to be deregistered and freed
*/
void
-rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
- struct ib_sge *iov;
-
if (!rb)
return;
- iov = &rb->rg_iov;
- ib_dma_unmap_single(ia->ri_device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ rpcrdma_dma_unmap_regbuf(rb);
kfree(rb);
}
@@ -1248,39 +1306,22 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
- struct ib_device *device = ia->ri_device;
- struct ib_send_wr send_wr, *send_wr_fail;
- struct rpcrdma_rep *rep = req->rl_reply;
- struct ib_sge *iov = req->rl_send_iov;
- int i, rc;
+ struct ib_send_wr *send_wr = &req->rl_send_wr;
+ struct ib_send_wr *send_wr_fail;
+ int rc;
- if (rep) {
- rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (req->rl_reply) {
+ rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
if (rc)
return rc;
req->rl_reply = NULL;
}
- send_wr.next = NULL;
- send_wr.wr_cqe = &req->rl_cqe;
- send_wr.sg_list = iov;
- send_wr.num_sge = req->rl_niovs;
- send_wr.opcode = IB_WR_SEND;
-
- for (i = 0; i < send_wr.num_sge; i++)
- ib_dma_sync_single_for_device(device, iov[i].addr,
- iov[i].length, DMA_TO_DEVICE);
dprintk("RPC: %s: posting %d s/g entries\n",
- __func__, send_wr.num_sge);
-
- if (DECR_CQCOUNT(ep) > 0)
- send_wr.send_flags = 0;
- else { /* Provider must take a send completion every now and then */
- INIT_CQCOUNT(ep);
- send_wr.send_flags = IB_SEND_SIGNALED;
- }
+ __func__, send_wr->num_sge);
- rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+ rpcrdma_set_signaled(ep, send_wr);
+ rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
if (rc)
goto out_postsend_err;
return 0;
@@ -1290,32 +1331,24 @@ out_postsend_err:
return -ENOTCONN;
}
-/*
- * (Re)post a receive buffer.
- */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
struct rpcrdma_rep *rep)
{
- struct ib_recv_wr recv_wr, *recv_wr_fail;
+ struct ib_recv_wr *recv_wr_fail;
int rc;
- recv_wr.next = NULL;
- recv_wr.wr_cqe = &rep->rr_cqe;
- recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
- recv_wr.num_sge = 1;
-
- ib_dma_sync_single_for_cpu(ia->ri_device,
- rdmab_addr(rep->rr_rdmabuf),
- rdmab_length(rep->rr_rdmabuf),
- DMA_BIDIRECTIONAL);
-
- rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+ if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+ goto out_map;
+ rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
if (rc)
goto out_postrecv;
return 0;
+out_map:
+ pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+ return -EIO;
+
out_postrecv:
pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
return -ENOTCONN;
@@ -1333,7 +1366,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
{
struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
struct rpcrdma_rep *rep;
int rc;
@@ -1344,7 +1376,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
rep = rpcrdma_buffer_get_rep_locked(buffers);
spin_unlock(&buffers->rb_lock);
- rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ rc = rpcrdma_ep_post_recv(ia, rep);
if (rc)
goto out_rc;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a71b0f5897d8..e35efd4ac1e4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,9 +70,12 @@ struct rpcrdma_ia {
struct ib_pd *ri_pd;
struct completion ri_done;
int ri_async_rc;
+ unsigned int ri_max_segs;
unsigned int ri_max_frmr_depth;
unsigned int ri_max_inline_write;
unsigned int ri_max_inline_read;
+ bool ri_reminv_expected;
+ enum ib_mr_type ri_mrtype;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -87,13 +90,30 @@ struct rpcrdma_ep {
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
+ struct rpcrdma_connect_private rep_cm_private;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
};
-#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
-#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+static inline void
+rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count)
+{
+ atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count);
+}
+
+/* To update send queue accounting, provider must take a
+ * send completion every now and then.
+ */
+static inline void
+rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr)
+{
+ send_wr->send_flags = 0;
+ if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) {
+ rpcrdma_init_cqcount(ep, 0);
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ }
+}
/* Pre-allocate extra Work Requests for handling backward receives
* and sends. This is a fixed value because the Work Queues are
@@ -112,9 +132,9 @@ struct rpcrdma_ep {
*/
struct rpcrdma_regbuf {
- size_t rg_size;
- struct rpcrdma_req *rg_owner;
struct ib_sge rg_iov;
+ struct ib_device *rg_device;
+ enum dma_data_direction rg_direction;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
@@ -162,7 +182,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* The smallest inline threshold is 1024 bytes, ensuring that
* at least 750 bytes are available for RPC messages.
*/
-#define RPCRDMA_MAX_HDR_SEGS (8)
+enum {
+ RPCRDMA_MAX_HDR_SEGS = 8,
+ RPCRDMA_HDRBUF_SIZE = 256,
+};
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +205,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
unsigned int rr_len;
+ int rr_wc_flags;
+ u32 rr_inv_rkey;
struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct list_head rr_list;
+ struct ib_recv_wr rr_recv_wr;
struct rpcrdma_regbuf *rr_rdmabuf;
};
@@ -207,7 +233,8 @@ struct rpcrdma_rep {
enum rpcrdma_frmr_state {
FRMR_IS_INVALID, /* ready to be used */
FRMR_IS_VALID, /* in use */
- FRMR_IS_STALE, /* failed completion */
+ FRMR_FLUSHED_FR, /* flushed FASTREG WR */
+ FRMR_FLUSHED_LI, /* flushed LOCALINV WR */
};
struct rpcrdma_frmr {
@@ -276,19 +303,30 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
-#define RPCRDMA_MAX_IOVS (2)
+/* Reserve enough Send SGEs to send a maximum size inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - xdr_buf tail iovec
+ */
+enum {
+ RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
+ RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+ RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_free;
- unsigned int rl_niovs;
+ unsigned int rl_mapped_sges;
unsigned int rl_connect_cookie;
- struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
- struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_rep *rl_reply;
+ struct ib_send_wr rl_send_wr;
+ struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+ struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
+ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */
+ struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */
struct ib_cqe rl_cqe;
struct list_head rl_all;
@@ -298,14 +336,16 @@ struct rpcrdma_req {
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
+static inline void
+rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
+{
+ rqst->rq_xprtdata = req;
+}
+
static inline struct rpcrdma_req *
rpcr_to_rdmar(struct rpc_rqst *rqst)
{
- void *buffer = rqst->rq_buffer;
- struct rpcrdma_regbuf *rb;
-
- rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
- return rb->rg_owner;
+ return rqst->rq_xprtdata;
}
/*
@@ -356,15 +396,6 @@ struct rpcrdma_create_data_internal {
unsigned int padding; /* non-rdma write header padding */
};
-#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
- (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
-
-#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
- (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
-
-#define RPCRDMA_INLINE_PAD_VALUE(rq)\
- rpcx_to_rdmad(rq->rq_xprt).padding
-
/*
* Statistics for RPCRDMA
*/
@@ -386,6 +417,7 @@ struct rpcrdma_stats {
unsigned long mrs_recovered;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
+ unsigned long local_inv_needed;
};
/*
@@ -409,6 +441,7 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mw *);
void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname;
+ const int ro_send_w_inv_ok;
};
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -457,19 +490,19 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_conn_func(struct rpcrdma_ep *ep);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_rep *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
/*
* Buffer calls - xprtrdma/verbs.c
*/
struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_destroy_req(struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
@@ -482,10 +515,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
- size_t, gfp_t);
-void rpcrdma_free_regbuf(struct rpcrdma_ia *,
- struct rpcrdma_regbuf *);
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
+ gfp_t);
+bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
+void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+
+static inline bool
+rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_device != NULL;
+}
+
+static inline bool
+rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+ if (likely(rpcrdma_regbuf_is_mapped(rb)))
+ return true;
+ return __rpcrdma_dma_map_regbuf(ia, rb);
+}
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
@@ -503,25 +550,30 @@ rpcrdma_data_dir(bool writing)
}
/*
- * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
- */
-void rpcrdma_connect_worker(struct work_struct *);
-void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct rpcrdma_rep *);
-
-/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
+
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
+bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
+ u32, struct xdr_buf *, enum rpcrdma_chunktype);
+void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_marshal_req(struct rpc_rqst *);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
- struct rpcrdma_create_data_internal *,
- unsigned int);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_reply_handler(struct work_struct *work);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void rpcrdma_connect_worker(struct work_struct *work);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf168838a029..af392d9b9cec 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
/* Race breaker in case memory is freed before above code is called */
- sk->sk_write_space(sk);
+ if (ret == -EAGAIN) {
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+ rcu_read_unlock();
+
+ sk->sk_write_space(sk);
+ }
return ret;
}
@@ -1071,10 +1080,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
if (sk == NULL)
goto out;
for (;;) {
- skb = skb_recv_datagram(sk, 0, 1, &err);
+ skb = skb_recv_udp(sk, 0, 1, &err);
if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram_locked(sk, skb);
+ consume_skb(skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
@@ -2533,35 +2542,39 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
* we allocate pages instead doing a kmalloc like rpc_malloc is because we want
* to use the server side send routines.
*/
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static int bc_malloc(struct rpc_task *task)
{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ size_t size = rqst->rq_callsize;
struct page *page;
struct rpc_buffer *buf;
- WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
- if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
- return NULL;
+ if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
+ WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
+ size);
+ return -EINVAL;
+ }
page = alloc_page(GFP_KERNEL);
if (!page)
- return NULL;
+ return -ENOMEM;
buf = page_address(page);
buf->len = PAGE_SIZE;
- return buf->data;
+ rqst->rq_buffer = buf->data;
+ rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+ return 0;
}
/*
* Free the space allocated in the bc_alloc routine
*/
-static void bc_free(void *buffer)
+static void bc_free(struct rpc_task *task)
{
+ void *buffer = task->tk_rqstp->rq_buffer;
struct rpc_buffer *buf;
- if (!buffer)
- return;
-
buf = container_of(buffer, struct rpc_buffer, data);
free_page((unsigned long)buf);
}